/*******************************************************************************
*                                                                              *
*   (C) 1997-2021 by Ernst W. Mayer.                                           *
*                                                                              *
*  This program is free software; you can redistribute it and/or modify it     *
*  under the terms of the GNU General Public License as published by the       *
*  Free Software Foundation; either version 2 of the License, or (at your      *
*  option) any later version.                                                  *
*                                                                              *
*  This program is distributed in the hope that it will be useful, but WITHOUT *
*  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or       *
*  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for   *
*  more details.                                                               *
*                                                                              *
*  You should have received a copy of the GNU General Public License along     *
*  with this program; see the file GPL.txt.  If not, you may view one at       *
*  http://www.fsf.org/licenses/licenses.html, or obtain one by writing to the  *
*  Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA     *
*  02111-1307, USA.                                                            *
*                                                                              *
*******************************************************************************/

/*******************************************************************************
   We now include this header file if it was not included before.
*******************************************************************************/
#ifndef sse2_macro_gcc_h_included
#define sse2_macro_gcc_h_included

#ifdef USE_ARM_V8_SIMD

	/* PAIR_SQUARE_4_SSE2(tAr,tBr,tCr,tDr, c,s,forth): ARMv8 NEON pair-square step.
	   All 7 args are *addresses* of vector-complex data: tA..tD are re/im pairs of
	   2-double NEON vectors; [c],[s] hold sincos weights and [forth] a vector 0.25.
	   Steps (mirroring the scalar PAIR_SQUARE_4 per the inline pseudocode below):
	     1. doubled cross-products rt,it from tA and double-swapped ~tD, and st,jt
	        from tB and ~tC ('ext vN.16b,vN.16b,vN.16b,#8' swaps a register's two
	        doubles, giving the '~' data);
	     2. in-place complex squares of tA,tB,tD,tC via (x+y)*(x-y) and 2*x*y;
	     3. weight/scale the cross terms by (1+c),s (resp. (1-s),c) and 0.25 —
	        reordered here as (c+1.0)/(s-1.0) with compensating sign flips, as the
	        per-line comments note;
	     4. final butterflies: update tA,tB directly and tD,tC with the
	        double-swapped ~rt,~it,~st,~jt (the N-j terms).
	   Uses x0-x6 and v0-v8 as scratch; all are in the clobber list. */
	#define PAIR_SQUARE_4_SSE2(XtAr, XtBr, XtCr, XtDr, Xc, Xs, Xforth)\
	{\
	__asm__ volatile (\
		/*   calculate cross-product terms...
			__rt=__tAr* ~tDr+__tAi* ~tDi; __rt=__rt+__rt;
			__it=__tAi* ~tDr-__tAr* ~tDi; __it=__it+__it;
		*/\
		"ldr	x3,%[__tDr]	\n\t"\
		"ldr	x0,%[__tAr]	\n\t"\
		"ldp	q6,q7,[x3]	\n\t"/* tDr,i */\
		"ldp	q4,q5,[x0]	\n\t"/* tAr,i */\
		"ext v6.16b,v6.16b,v6.16b,#8	\n\t"/* ~tDr */\
		"ext v7.16b,v7.16b,v7.16b,#8	\n\t"/* ~tDi */\
		"fmul	v0.2d,v4.2d,v6.2d	\n\t"/* tAr*~tDr */\
		"fmul	v1.2d,v5.2d,v6.2d	\n\t"/* tAi*~tDr */\
		"fmla	v0.2d,v5.2d,v7.2d	\n\t"/* rt = tAr*~tDr + tAi*~tDi */\
		"fmls	v1.2d,v4.2d,v7.2d	\n\t"/* it = tAi*~tDr - tAr*~tDi */\
		"fadd	v0.2d,v0.2d,v0.2d	\n\t"/* rt=rt+rt */\
		"fadd	v1.2d,v1.2d,v1.2d	\n\t"/* it=it+it */\
		/*
			__st=__tBr* ~tCr+__tBi* ~tCi; __st=__st+__st;
			__jt=__tBi* ~tCr-__tBr* ~tCi; __jt=__jt+__jt;
		*/\
		"ldr	x2,%[__tCr]	\n\t"\
		"ldr	x1,%[__tBr]	\n\t"\
		"ldp	q6,q7,[x2]	\n\t"/* tCr,i */\
		"ldp	q4,q5,[x1]	\n\t"/* tBr,i */\
		"ext v6.16b,v6.16b,v6.16b,#8	\n\t"/* ~tCr */\
		"ext v7.16b,v7.16b,v7.16b,#8	\n\t"/* ~tCi */\
		"fmul	v2.2d,v4.2d,v6.2d	\n\t"/* tBr*~tCr */\
		"fmul	v3.2d,v5.2d,v6.2d	\n\t"/* tBi*~tCr */\
		"fmla	v2.2d,v5.2d,v7.2d	\n\t"/* st = tBr*~tCr + tBi*~tCi */\
		"fmls	v3.2d,v4.2d,v7.2d	\n\t"/* jt = tBi*~tCr - tBr*~tCi */\
		"fadd	v2.2d,v2.2d,v2.2d	\n\t"/* st=st+st */\
		"fadd	v3.2d,v3.2d,v3.2d	\n\t"/* jt=jt+jt */\
		/* Now calculate square terms and __store back in the same temporaries:
			__tmp = (__tAr+__tAi)*(__tAr-__tAi); __tAi=__tAr*__tAi; __tAi=__tAi+__tAi; __tAr=__tmp;
		*/\
		"ldp	q6,q7,[x0]	\n\t"/* tAr,i */\
		"fsub	v4.2d,v6.2d,v7.2d	\n\t"/* (tAr-tAi) */\
		"fadd	v5.2d,v6.2d,v7.2d	\n\t"/* (tAr+tAi) */\
		"fadd	v7.2d,v7.2d,v7.2d	\n\t"/* 2*tAi */\
		"fmul	v4.2d,v4.2d,v5.2d	\n\t"/* (tAr+tAi)*(tAr-tAi) = tAr^2-tAi^2 */\
		"fmul	v5.2d,v6.2d,v7.2d	\n\t"/* 2*tAr*tAi */\
		"stp	q4,q5,[x0]	\n\t"/* tmp-store */\
		"fsub	v0.2d,v0.2d,v4.2d	\n\t"/* rt-tAr */\
		"fsub	v1.2d,v1.2d,v5.2d	\n\t"/* it-tAi */\
		/* __tmp = (__tBr+__tBi)*(__tBr-__tBi); __tBi=__tBr*__tBi; __tBi=__tBi+__tBi; __tBr=__tmp;	[Can be done in parallel with above segment] */\
		"ldp	q6,q7,[x1]	\n\t"/* tBr,i */\
		"fsub	v4.2d,v6.2d,v7.2d	\n\t"/* (tBr-tBi) */\
		"fadd	v5.2d,v6.2d,v7.2d	\n\t"/* (tBr+tBi) */\
		"fadd	v7.2d,v7.2d,v7.2d	\n\t"/* 2*tBi */\
		"fmul	v4.2d,v4.2d,v5.2d	\n\t"/* (tBr+tBi)*(tBr-tBi) = tBr^2-tBi^2 */\
		"fmul	v5.2d,v6.2d,v7.2d	\n\t"/* 2*tBr*tBi */\
		"stp	q4,q5,[x1]	\n\t"/* tmp-store */\
		"fsub	v2.2d,v2.2d,v4.2d	\n\t"/* st-tBr */\
		"fsub	v3.2d,v3.2d,v5.2d	\n\t"/* jt-tBi */\
		/* __tmp = (__tDr+__tDi)*(__tDr-__tDi); __tDi=__tDr*__tDi; __tDi=__tDi+__tDi; __tDr=__tmp; */\
		"ldp	q6,q7,[x3]	\n\t"/* tDr,i */\
		"fsub	v4.2d,v6.2d,v7.2d	\n\t"/* (tDr-tDi) */\
		"fadd	v5.2d,v6.2d,v7.2d	\n\t"/* (tDr+tDi) */\
		"fadd	v7.2d,v7.2d,v7.2d	\n\t"/* 2*tDi */\
		"fmul	v4.2d,v4.2d,v5.2d	\n\t"/* (tDr+tDi)*(tDr-tDi) = tDr^2-tDi^2 */\
		"fmul	v5.2d,v6.2d,v7.2d	\n\t"/* 2*tDr*tDi */\
		"stp	q4,q5,[x3]	\n\t"/* tmp-store */\
		"ext v4.16b,v4.16b,v4.16b,#8	\n\t"/* ~tDr */\
		"ext v5.16b,v5.16b,v5.16b,#8	\n\t"/* ~tDi */\
		"fsub	v0.2d,v0.2d,v4.2d	\n\t"/* rt-__tAr- ~tDr */\
		"fadd	v1.2d,v1.2d,v5.2d	\n\t"/* it-__tAi+ ~tDi */\
		/* __tmp = (__tCr+__tCi)*(__tCr-__tCi); __tCi=__tCr*__tCi; __tCi=__tCi+__tCi; __tCr=__tmp;	[Can be done in parallel with above segment] */\
		"ldp	q6,q7,[x2]	\n\t"/* tCr,i */\
		"fsub	v4.2d,v6.2d,v7.2d	\n\t"/* (tCr-tCi) */\
		"fadd	v5.2d,v6.2d,v7.2d	\n\t"/* (tCr+tCi) */\
		"fadd	v7.2d,v7.2d,v7.2d	\n\t"/* 2*tCi */\
		"fmul	v4.2d,v4.2d,v5.2d	\n\t"/* (tCr+tCi)*(tCr-tCi) = tCr^2-tCi^2 */\
		"fmul	v5.2d,v6.2d,v7.2d	\n\t"/* 2*tCr*tCi */\
		"stp	q4,q5,[x2]	\n\t"/* tmp-store */\
		"ext v4.16b,v4.16b,v4.16b,#8	\n\t"/* ~tCr */\
		"ext v5.16b,v5.16b,v5.16b,#8	\n\t"/* ~tCi */\
		"fsub	v2.2d,v2.2d,v4.2d	\n\t"/* st-__tBr- ~tCr */\
		"fadd	v3.2d,v3.2d,v5.2d	\n\t"/* jt-__tBi+ ~tCi */\
		/*
			__tmp=((1.0+__c)*__rt-__s*__it)*0.25;
			__it =((1.0+__c)*__it+__s*__rt)*0.25;	__rt=__tmp;
			[Can be done in parallel with above segment]
		*/\
		"ldr	x4,%[__c]	\n\t	mov	v6.16b,v0.16b	\n\t"/* cpy rt */\
		"ldr	x5,%[__s]	\n\t	mov	v7.16b,v1.16b	\n\t"/* cpy it */\
		"ldr	x6,%[__forth]	\n\t"\
		"ld1	{v4.16b},[x4]	\n\t"/* c */\
		"ld1	{v5.16b},[x5]	\n\t"/* s */\
		"ld1	{v8.16b},[x6]	\n\t"/* 0.25 */\
		"fmla	v6.2d,v4.2d,v6.2d	\n\t"/* (c+1.0)*rt */\
		"fmla	v7.2d,v4.2d,v7.2d	\n\t"/* (c+1.0)*it */\
		"fmls	v6.2d,v5.2d,v1.2d	\n\t"/* (c+1.0)*rt - s*it */\
		"fmla	v7.2d,v5.2d,v0.2d	\n\t"/* (c+1.0)*it + s*rt */\
		"fmul	v0.2d,v8.2d,v6.2d	\n\t"/* -rt Both of these inherit the sign flip [w.r.to the non-SSE2 PAIR_SQUARE_4 macro] */\
		"fmul	v1.2d,v8.2d,v7.2d	\n\t"/* -it that resulted from the in-place-friendlier (rt-__tAr- ~tDr) reordering above. */\
		/*
			__tmp=((1.0-__s)*__st-__c*__jt)*0.25;
			__jt =((1.0-__s)*__jt+__c*__st)*0.25	__st=__tmp;
			[Can be done in parallel with above segment]
		*/\
		"eor v7.16b,v7.16b,v7.16b	\n\t"/* 0.0 ... note NEG of v2.2d,v3.2d (result into v6.2d,v7.2d) gave incorrect results! [To-do: see if that was GCC emitting wrong hardware instruction, or something else.] */\
		"fsub	v6.2d,v7.2d,v2.2d	\n\t"/* -cpy st */\
		"fsub	v7.2d,v7.2d,v3.2d	\n\t"/* -cpy jt */\
		"fmla	v6.2d,v5.2d,v2.2d	\n\t"/* (s-1.0)*st, note sign flip! */\
		"fmla	v7.2d,v5.2d,v3.2d	\n\t"/* (s-1.0)*jt, note sign flip! */\
		"fmla	v6.2d,v4.2d,v3.2d	\n\t"/* (s-1.0)*st + c*jt */\
		"fmls	v7.2d,v4.2d,v2.2d	\n\t"/* (s-1.0)*jt - c*st */\
		"fmul	v2.2d,v8.2d,v6.2d	\n\t"/* +st Sign flip due to (s-1.0) reordering here */\
		"fmul	v3.2d,v8.2d,v7.2d	\n\t"/* +jt cancels earlier one due to in-place-friendlier (st-__tBr- ~tCr) reordering above. */\
		/*...and now complete and store the results. We flip the signs on st and jt here to undo the above -st,-jt negations. */\
		/*	__tAr = (__tAr+__rt);
			__tAi = (__tAi+__it);
			__tBr = (__tBr-__st);
			__tBi = (__tBi-__jt);
		*/\
		"ldp	q4,q5,[x0]	\n\t"/* tAr,i */\
		"ldp	q6,q7,[x1]	\n\t"/* tBr,i */\
		"fadd	v4.2d,v4.2d,v0.2d	\n\t"/* (__tAr+__rt) */\
		"fadd	v5.2d,v5.2d,v1.2d	\n\t"/* (__tAi+__it) */\
		"fsub	v6.2d,v6.2d,v2.2d	\n\t"/* (__tBr-__st) */\
		"fsub	v7.2d,v7.2d,v3.2d	\n\t"/* (__tBi-__jt) */\
		"stp	q4,q5,[x0]	\n\t"\
		"stp	q6,q7,[x1]	\n\t"\
		/*...N-j terms are as above, but with the replacements: __tAr<--> ~tDr, __tAi<--> ~tDi, __it --> -__it. */\
		/*	__tDr = (__tDr+ ~rt);
			__tDi = (__tDi- ~it);
			__tCr = (__tCr- ~st);
			__tCi = (__tCi+ ~jt);
		*/\
		"ldp	q4,q5,[x3]	\n\t"/* tDr,i */\
		"ldp	q6,q7,[x2]	\n\t"/* tCr,i */\
		"ext v0.16b,v0.16b,v0.16b,#8	\n\t"/* ~rt */\
		"ext v1.16b,v1.16b,v1.16b,#8	\n\t"/* ~it */\
		"ext v2.16b,v2.16b,v2.16b,#8	\n\t"/* ~st */\
		"ext v3.16b,v3.16b,v3.16b,#8	\n\t"/* ~jt */\
		"fadd	v4.2d,v4.2d,v0.2d	\n\t"/* (__tDr+ ~rt) */\
		"fsub	v5.2d,v5.2d,v1.2d	\n\t"/* (__tDi- ~it) */\
		"fsub	v6.2d,v6.2d,v2.2d	\n\t"/* (__tCr- ~st) */\
		"fadd	v7.2d,v7.2d,v3.2d	\n\t"/* (__tCi+ ~jt) */\
		"stp	q4,q5,[x3]	\n\t"\
		"stp	q6,q7,[x2]	\n\t"\
		/* Cost (FMA = MUL): [43 vector-load/store (20 pairwise, 3 ld1), 12 shufpd, 34 addpd, 28 mulpd, 2 vector-register-copy] */\
		/* Compare w/PAIR_MUL[35 vector-load/store (32 pairwise, 3 ld1), 12 shufpd, 18 addpd, 32 mulpd, 1 vector-register-copy] ... using more vregs in PAIR_MUL_4 worth it! */\
		:					/* outputs: none */\
		: [__tAr] "m" (XtAr)	/* All inputs from memory addresses here */\
		 ,[__tBr] "m" (XtBr)\
		 ,[__tCr] "m" (XtCr)\
		 ,[__tDr] "m" (XtDr)\
		 ,[__c] "m" (Xc)\
		 ,[__s] "m" (Xs)\
		 ,[__forth] "m" (Xforth)\
		: "cc","memory","x0","x1","x2","x3","x4","x5","x6","v0","v1","v2","v3","v4","v5","v6","v7","v8"	/* Clobbered registers */\
	);\
	}

	// Sep 2019: 2-input FFT(a)*FFT(b) version of above PAIR_SQUARE_4_SSE2 macro, based on PAIR_MUL_4 macro in pair_square.h:
	// NOTE: Unlike the PAIR_SQUARE_4 version of this macro, the MUL version assumes the sincos terms premultiplied by 1/4!
	/* PAIR_MUL_4_SSE2(A0-A3, B0-B3, c,s,forth): 2-input FFT(a)*FFT(b) counterpart
	   of PAIR_SQUARE_4_SSE2, modeled on the PAIR_MUL_4 macro in pair_square.h.
	   All args are addresses of vector-complex data; [c],[s] hold sincos data
	   assumed PREmultiplied by 0.25 (unlike the square version), [forth] = 0.25.
	   Results overwrite A0-A3 in place; B0-B3 are read-only. Intermediates t0-t3
	   are parked in the A2/A3 memlocs to free registers (see tmp-store comments).
	   NOTE(review): the banner comments inside flag a known sign subtlety — the
	   '~a3r -= a0r'-style subtractions negate a2r,a3r,b2r,b3r relative to the
	   scalar reference, and compensating +/- swaps are applied in the later
	   sincos-weight computation; preserve these together if ever modifying. */
	#define PAIR_MUL_4_SSE2(XA0,XA1,XA2,XA3, XB0,XB1,XB2,XB3, Xc,Xs,Xforth)\
	{\
	__asm__ volatile (\
		/* Load a2,a3 and b2,b3, d0,d1-swap, then compute
			t0 = ~a3r*~b3r - ~a3i*~b3i, t2 = ~a3r*~b3i + ~a3i*~b3r
			t1 = ~a2r*~b2r - ~a2i*~b2i, t3 = ~a2r*~b2i + ~a2i*~b2r
		*/\
		"ldr	x2,%[__A2]	\n\t"\
		"ldr	x3,%[__A3]	\n\t"\
		"ldr	x4,%[__B2]	\n\t"\
		"ldr	x5,%[__B3]	\n\t"\
		/* Must load double-pairs-to-be-swapped into regs first, since SHUFPD takes low double from DEST and high from SRC: */\
		"ldp	q0,q1,[x2]	\n\t"/* a2 */\
		"ldp	q4,q5,[x4]	\n\t"/* b2 */\
		"ext v0.16b,v0.16b,v0.16b,#8	\n\t	ext v1.16b,v1.16b,v1.16b,#8	\n\t"/* ~a2 */\
		"ext v4.16b,v4.16b,v4.16b,#8	\n\t	ext v5.16b,v5.16b,v5.16b,#8	\n\t"/* ~b2 */\
		"ldp	q2,q3,[x3]	\n\t"/* a3 */\
		"ldp	q6,q7,[x5]	\n\t"/* b3 */\
		"ext v2.16b,v2.16b,v2.16b,#8	\n\t	ext v3.16b,v3.16b,v3.16b,#8	\n\t"/* ~a3 */\
		"ext v6.16b,v6.16b,v6.16b,#8	\n\t	ext v7.16b,v7.16b,v7.16b,#8	\n\t"/* ~b3 */\
		"fmul	v8.2d ,v4.2d,v0.2d	\n\t"/* ~a2r*~b2r */\
		"fmul	v9.2d ,v5.2d,v0.2d	\n\t"/* ~a2r*~b2i */\
		"fmul	v10.2d,v6.2d,v2.2d	\n\t"/* ~a3r*~b3r */\
		"fmul	v11.2d,v7.2d,v2.2d	\n\t"/* ~a3r*~b3i */\
		"fmls	v8.2d ,v5.2d,v1.2d	\n\t"/* t1 = ~a2r*~b2r - ~a2i*~b2i */\
		"fmla	v9.2d ,v4.2d,v1.2d	\n\t"/* t3 = ~a2r*~b2i + ~a2i*~b2r */\
		"fmls	v10.2d,v7.2d,v3.2d	\n\t"/* t0 = ~a3r*~b3r - ~a3i*~b3i */\
		"fmla	v11.2d,v6.2d,v3.2d	\n\t"/* t2 = ~a3r*~b3i + ~a3i*~b3r */\
		/* t1,3 and t0,2 not needed until final butterfly sequence, so write back to A2,3 memlocs: */\
		"stp	q8 ,q9 ,[x2]	\n\t"\
		"stp	q10,q11,[x3]	\n\t"\
	/* a2,3 in v0-3, b2,3 in v4-7, t1,3 in (x2), t0,2 in (x3) */\
		/* calculate difference terms...these need the [a,b][2|3] vector-data to be d0,1-swapped:
			~a3r -= a0r, ~a3i += a0i,
			~a2r -= a1r, ~a2i += a1i, similar for b-data, but move ~b2 -+ b1 down to just before a1*b1 cmul to free up 2 regs.
		*/\
/*** Need ~a3r = a0r - ~a3r, not ~a3r -= a0r! [Similar for a2r,b3r,b2r] ***
************** As currently, a2r,a3r,b2r,b3r all negated! ****************/\
		"ldr	x0,%[__A0]	\n\t"\
		"ldr	x1,%[__A1]	\n\t"\
		"ldr	x4,%[__B0]	\n\t"\
		"ldr	x5,%[__B1]	\n\t"\
		"ldp	q16,q17,[x0]	\n\t"/* a0 */\
		"ldp	q18,q19,[x1]	\n\t"/* a1 */\
		"ldp	q12,q13,[x4]	\n\t"/* b0 */\
		"ldp	q14,q15,[x5]	\n\t"/* b1 */\
		"fsub	v2.2d,v2.2d,v16.2d	\n\t"/* ~a3r -= a0r */\
		"fadd	v3.2d,v3.2d,v17.2d	\n\t"/* ~a3i += a0i */\
		"fsub	v0.2d,v0.2d,v18.2d	\n\t"/* ~a2r -= a1r */\
		"fadd	v1.2d,v1.2d,v19.2d	\n\t"/* ~a2i += a1i */\
		"fsub	v6.2d,v6.2d,v12.2d	\n\t"/* ~b3r -= b0r */\
		"fadd	v7.2d,v7.2d,v13.2d	\n\t"/* ~b3i += b0i */\
		"fsub	v4.2d,v4.2d,v14.2d	\n\t"/* ~b2r -= b1r */\
		"fadd	v5.2d,v5.2d,v15.2d	\n\t"/* ~b2i += b1i */\
		/* now calculate 1st square-like term and store back in H(j) slot:
			t4 = a0r*b0r - a0i*b0i, a0i = a0r*b0i + a0i*b0r, a0r = t4
			t5 = a1r*b1r - a1i*b1i, a1i = a1r*b1i + a1i*b1r, a1r = t5
		*/\
		"fmul	v8.2d ,v12.2d,v16.2d	\n\t"/* a0r*b0r */\
		"fmul	v9.2d ,v13.2d,v16.2d	\n\t"/* a0r*b0i */\
		"fmul	v10.2d,v14.2d,v18.2d	\n\t"/* a1r*b1r */\
		"fmul	v11.2d,v15.2d,v18.2d	\n\t"/* a1r*b1i */\
		"fmls	v8.2d ,v13.2d,v17.2d	\n\t"/* a0r' = a0r*b0r - a0i*b0i */\
		"fmla	v9.2d ,v12.2d,v17.2d	\n\t"/* a0i' = a0r*b0i + a0i*b0r */\
		"fmls	v10.2d,v15.2d,v19.2d	\n\t"/* a1r' = a1r*b1r - a1i*b1i */\
		"fmla	v11.2d,v14.2d,v19.2d	\n\t"/* a1i' = a1r*b1i + a1i*b1r */\
	/* a0,1 in v8-11, a2,3 in v0-3, b2,3 in v4-7, t1,3 in (x2), t0,2 in (x3) */\
		/* calculate the complex products to build the second term:
			t4 = ~a3r*~b3r - ~a3i*~b3i, ~a3i = ~a3r*~b3i + ~a3i*~b3r, ~a3r,i in v2,3, ~b3r,i in v6,7
			t5 = ~a2r*~b2r - ~a2i*~b2i, ~a2i = ~a2r*~b2i + ~a2i*~b2r, ~arr,i in v0,1, ~b2r,i in v4,5
		*/\
/****************** a2r,a3r,b2r,b3r being negated means a2i,a3i come out negated ****************/\
		"fmul	v12.2d,v4.2d,v0.2d	\n\t"/* ~a2r*~b2r */\
		"fmul	v13.2d,v5.2d,v0.2d	\n\t"/* ~a2r*~b2i */\
		"fmul	v14.2d,v6.2d,v2.2d	\n\t"/* ~a3r*~b3r */\
		"fmul	v15.2d,v7.2d,v2.2d	\n\t"/* ~a3r*~b3i */\
		"fmls	v12.2d,v5.2d,v1.2d	\n\t"/* t5   = ~a2r*~b2r - ~a2i*~b2i */\
		"fmla	v13.2d,v4.2d,v1.2d	\n\t"/* ~a2i = ~a2r*~b2i + ~a2i*~b2r */\
		"fmls	v14.2d,v7.2d,v3.2d	\n\t"/* t4   = ~a3r*~b3r - ~a3i*~b3i */\
		"fmla	v15.2d,v6.2d,v3.2d	\n\t"/* ~a3i = ~a3r*~b3i + ~a3i*~b3r */\
/*** sse2 code has t4,~a3i in xmm2,3, t5,~a2i in xmm0,1 ***/\
		/* v0-7 free */\
		/* Assume [c0,s1],[s0,c1] sincos vector-data are in the [c] and [s]-input-pointers, then compute
			~a3r = [cc+0.25]*t4 - [ss]*~a3i, ~a3i = [ss]*t4 + [cc+0.25]*~a3i
			~a2r = [0.25-ss]*t5 - [cc]*~a2i, ~a2i = [cc]*t5 + [0.25-ss]*~a2i ,
		where cc = 0.25*[c0,s1] and ss = 0.25*[s0,c1]:
		*/\
/****************** a2i,a3i being negated requires +- sign swap in this next computation ****************/\
		"ldr	x4,%[__forth]		\n\t	ld1	{v6.16b},[x4]	\n\t	mov	v7.16b,v6.16b	\n\t"/* 2 copies of 0.25 */\
		"ldr	x4,%[__c]			\n\t	ld1	{v4.16b},[x4]	\n\t"/*	cc assumed premultiplied by 0.25 */\
		"ldr	x5,%[__s]			\n\t	ld1	{v5.16b},[x5]	\n\t"/*	ss assumed premultiplied by 0.25 */\
		"fadd	v6.2d,v6.2d,v4.2d	\n\t	fsub	v7.2d,v7.2d,v5.2d	\n\t"/* [cc+0.25],[0.25-ss] in v6,7 */\
		"fmul	v2.2d,v6.2d,v14.2d	\n\t"/*   t4*[cc+0.25] */\
		"fmul	v3.2d,v5.2d,v14.2d	\n\t"/*   t4*[ss] */\
		"fmul	v0.2d,v7.2d,v12.2d	\n\t"/*   t5*[0.25-ss] */\
		"fmul	v1.2d,v4.2d,v12.2d	\n\t"/*   t5*[cc] */\
		"fmla	v2.2d,v5.2d,v15.2d	\n\t"/* ~a3i*[ss] */\
		"fmls	v3.2d,v6.2d,v15.2d	\n\t"/* ~a3i*[cc+0.25]; ~a3r,~a3i in v2,3 */\
		"fmla	v0.2d,v4.2d,v13.2d	\n\t"/* ~a2i*[cc] */\
		"fmls	v1.2d,v7.2d,v13.2d	\n\t"/* ~a2i*[0.25-ss]; ~a2r,~a2i in v0,1 */\
	/* a0,1 in v8-11, a2,3 in v0-3, t1,3 in (x2), t0,2 in (x3) */\
		"ldp	q4,q5,[x3]	\n\t"/* t0,2 */\
		"ldp	q6,q7,[x2]	\n\t"/* t1,3 */\
	/* and now complete and store the results:
		a0r -= ~a3r, a0i -= ~a3i
		a1r -= ~a2r, a1i -= ~a2i
	N-j terms:
		~a3r = t0 - ~a3r, ~a3i += t2
		~a2r = t1 - ~a2r, ~a2i += t3
	*/\
/****************** a2i,a3i in v1,3; *NOT* negated as in the sse2 case ****************/\
		"fsub	v8.2d ,v8.2d ,v2.2d	\n\t	fsub	v9.2d ,v9.2d ,v3.2d	\n\t"	/* a0r,i in v8 ,9 ; ~a3r,i in v2,3 */\
		"fsub	v10.2d,v10.2d,v0.2d	\n\t	fsub	v11.2d,v11.2d,v1.2d	\n\t"	/* a1r,i in v10,11; ~a2r,i in v0,1 */\
		"fsub	v4.2d ,v4.2d ,v2.2d	\n\t	fadd	v5.2d ,v5.2d ,v3.2d	\n\t"	/* t0,2 in v4,5 */\
		"fsub	v6.2d ,v6.2d ,v0.2d	\n\t	fadd	v7.2d ,v7.2d ,v1.2d	\n\t"	/* t1,3 in v6,7 */\
	/* Interleave writes of a0,a1 with un-shufflings of ~a2,~a3: */\
		"stp	q8 ,q9 ,[x0]	\n\t"\
		"stp	q10,q11,[x1]	\n\t"\
		"ext v4.16b,v4.16b,v4.16b,#8	\n\t	ext v5.16b,v5.16b,v5.16b,#8	\n\t"/* ~a3 */\
		"ext v6.16b,v6.16b,v6.16b,#8	\n\t	ext v7.16b,v7.16b,v7.16b,#8	\n\t"/* ~a2 */\
		"stp	q4 ,q5 ,[x3]	\n\t"\
		"stp	q6 ,q7 ,[x2]	\n\t"\
		/* Cost (FMA = MUL): [35 vector-load/store (32 pairwise, 3 ld1), 12 shufpd, 18 addpd, 32 mulpd,  1 vector-register-copy] */\
		/* Compare vs. SSE2: [35 vector-load/store (0 implicit)        , 12 shufpd, 34 addpd, 32 mulpd, 21 vector-register-copy] */\
		:					/* outputs: none */\
		: [__A0] "m" (XA0)	/* All inputs from memory addresses here */\
		 ,[__A1] "m" (XA1)\
		 ,[__A2] "m" (XA2)\
		 ,[__A3] "m" (XA3)\
		 ,[__B0] "m" (XB0)\
		 ,[__B1] "m" (XB1)\
		 ,[__B2] "m" (XB2)\
		 ,[__B3] "m" (XB3)\
		 ,[__c] "m" (Xc)\
		 ,[__s] "m" (Xs)\
		 ,[__forth] "m" (Xforth)\
		: "cc","memory","x0","x1","x2","x3","x4","x5","v0","v1","v2","v3","v4","v5","v6","v7","v8","v9","v10","v11","v12","v13","v14","v15","v16","v17","v18","v19"	/* Clobbered registers */\
	);\
	}

  #ifdef DFT3_V1	// Basic version:
	/* SSE2_RADIX_03_DFT(i0,i1,i2, cc1, o0,o1,o2): radix-3 complex DFT, basic
	   [MUL+ADD] version. Loads 3 vector-complex inputs from addresses i0-2,
	   applies the radix-3 butterfly using the two vector twiddle constants at
	   cc1 (loaded pairwise as q6,q7), and stores the 3 outputs to o0-2.
	   Output addresses may coincide with inputs (all addresses are reloaded
	   from the operand list before the stores). Address loads are interleaved
	   with the arithmetic column-wise to hide latency. */
	#define SSE2_RADIX_03_DFT(Xi0,Xi1,Xi2, Xcc1, Xo0,Xo1,Xo2)\
	{\
	__asm__ volatile (\
		"ldr	x1,%[__i1]			\n\t"\
		"ldr	x0,%[__i0]			\n\t	ldp	q4,q5,[x1]		\n\t"/* in1 */\
		"ldr	x2,%[__i2]			\n\t	ldp	q0,q1,[x0]		\n\t"/* in0 */\
		"ldr	x3,%[__cc1]			\n\t	ldp	q6,q7,[x2]		\n\t"/* in2 */\
		"fadd	v2.2d,v4.2d,v6.2d	\n\t	ldr	x1,%[__o1]		\n\t"/* t1 = in1+in2 */\
		"fadd	v3.2d,v5.2d,v7.2d	\n\t	ldr	x0,%[__o0]		\n\t"\
		"fsub	v4.2d,v4.2d,v6.2d	\n\t	ldr	x2,%[__o2]		\n\t"/* t2 = in1-in2 */\
		"fsub	v5.2d,v5.2d,v7.2d	\n\t	ldp	q6,q7,[x3]		\n\t"/* cc1 */\
		"fadd	v0.2d,v0.2d,v2.2d	\n\t	fmul	v2.2d,v2.2d,v6.2d	\n\t"/* out0 = in0+t1; c*t1 */\
		"fadd	v1.2d,v1.2d,v3.2d	\n\t	fmul	v3.2d,v3.2d,v6.2d	\n\t"\
		"fmul	v4.2d,v4.2d,v7.2d	\n\t	stp	q0,q1,[x0]		\n\t"/* s*t2; store out0 */\
		"fmul	v5.2d,v5.2d,v7.2d	\n\t	fadd	v2.2d,v2.2d,v0.2d	\n\t"\
		"fadd	v0.2d,v2.2d,v5.2d	\n\t	fadd	v3.2d,v3.2d,v1.2d	\n\t"\
		"fsub	v1.2d,v3.2d,v4.2d	\n\t"\
		"fsub	v2.2d,v2.2d,v5.2d	\n\t"\
		"fadd	v3.2d,v3.2d,v4.2d	\n\t"\
		"stp	q0,q1,[x2]			\n\t"/* out2 */\
		"stp	q2,q3,[x1]			\n\t"/* out1 */\
		:					/* outputs: none */\
		: [__i0] "m" (Xi0)	/* All inputs from memory addresses here */\
		 ,[__i1] "m" (Xi1)\
		 ,[__i2] "m" (Xi2)\
		 ,[__cc1] "m" (Xcc1)\
		 ,[__o0] "m" (Xo0)\
		 ,[__o1] "m" (Xo1)\
		 ,[__o2] "m" (Xo2)\
		: "cc","memory","x0","x1","x2","x3","v0","v1","v2","v3","v4","v5","v6","v7"	/* Clobbered registers */\
	);\
	}
  #else	// Version 2 replaces final [4 MUL, 6 ADD] sequence with [6 FMA]. Due to ARMv8's restricted
		// FMA syntax, in which output register must hold addend, this costs 4 additional reg-copies:
	/* SSE2_RADIX_03_DFT(i0,i1,i2, cc1, o0,o1,o2): radix-3 complex DFT, FMA
	   version — replaces the final [4 MUL, 6 ADD] of the basic version with
	   6 FMA ops. ARMv8 FMA requires the destination register to hold the
	   addend, hence the 4 extra 'mov vN.16b' register copies. Same interface
	   and results as the DFT3_V1 variant: 3 vector-complex inputs at i0-2,
	   twiddle constants at cc1, 3 outputs to o0-2 (may coincide with inputs). */
	#define SSE2_RADIX_03_DFT(Xi0,Xi1,Xi2, Xcc1, Xo0,Xo1,Xo2)\
	{\
	__asm__ volatile (\
		"ldr	x1,%[__i1]			\n\t"\
		"ldr	x2,%[__i2]			\n\t	ldp	q4,q5,[x1]		\n\t"/* in1 */\
		"ldr	x0,%[__i0]			\n\t"\
		"ldr	x3,%[__cc1]			\n\t	ldp	q6,q7,[x2]		\n\t"/* in2 */\
		"fadd	v8.2d,v4.2d,v6.2d	\n\t"/* t1 = in1+in2 */\
		"fadd	v9.2d,v5.2d,v7.2d	\n\t	ldp	q0,q1,[x0]		\n\t"/* in0 */\
		"fsub	v4.2d,v4.2d,v6.2d	\n\t"/* t2 = in1-in2 */\
		"fsub	v5.2d,v5.2d,v7.2d	\n\t	ldr	x0,%[__o0]		\n\t"\
		"fadd	v0.2d,v0.2d,v8.2d	\n\t"/* out0 = in0+t1 */\
		"fadd	v1.2d,v1.2d,v9.2d	\n\t	ldp	q6,q7,[x3]		\n\t"/* cc1 */\
		"stp	q0,q1,[x0]		\n\t"/* store out0 */\
		"mov	v2.16b,v0.16b	\n\t"/* FMA needs addend in dest: copy out0 */\
		"mov	v3.16b,v1.16b	\n\t"\
		"fmla	v2.2d,v8.2d,v6.2d	\n\t"/* out0 + c*t1 */\
		"fmla	v3.2d,v9.2d,v6.2d	\n\t"\
		"mov	v0.16b,v2.16b	\n\t"/* copy again for the +/- s*t2 pair */\
		"mov	v1.16b,v3.16b	\n\t"\
		"fmla	v0.2d,v7.2d,v5.2d	\n\t	ldr	x2,%[__o2]		\n\t"\
		"fmls	v1.2d,v7.2d,v4.2d	\n\t"\
		"fmls	v2.2d,v7.2d,v5.2d	\n\t	ldr	x1,%[__o1]		\n\t"\
		"fmla	v3.2d,v7.2d,v4.2d	\n\t"\
		"stp	q0,q1,[x2]			\n\t"/* out2 */\
		"stp	q2,q3,[x1]			\n\t"/* out1 */\
		:					/* outputs: none */\
		: [__i0] "m" (Xi0)	/* All inputs from memory addresses here */\
		 ,[__i1] "m" (Xi1)\
		 ,[__i2] "m" (Xi2)\
		 ,[__cc1] "m" (Xcc1)\
		 ,[__o0] "m" (Xo0)\
		 ,[__o1] "m" (Xo1)\
		 ,[__o2] "m" (Xo2)\
		: "cc","memory","x0","x1","x2","x3","v0","v1","v2","v3","v4","v5","v6","v7","v8","v9"	/* Clobbered registers */\
	);\
	}
  #endif

	// Dual-dataset version of above - just do the two 3-DFTs side by side using separate
	// register sets except for those holding the sincos data, which can be shared:
	/* SSE2_RADIX_03_DFT_X2(cc0, i0-2,o0-2, j0-2,u0-2): two independent radix-3
	   DFTs interleaved column-wise for ILP. Left columns (x0-x3, v0-v7) run
	   DFT #1: inputs i0-2 -> outputs o0-2; right columns (x4-x6, v8-v15) run
	   DFT #2: inputs j0-2 -> outputs u0-2. The twiddle constants at cc0 are
	   loaded once into v6,v7 and shared by both streams; each stream otherwise
	   uses its own registers. Arithmetic mirrors the basic (DFT3_V1)
	   single-stream macro line-for-line. */
	#define SSE2_RADIX_03_DFT_X2(Xcc0, Xi0,Xi1,Xi2, Xo0,Xo1,Xo2, Xj0,Xj1,Xj2, Xu0,Xu1,Xu2)\
	{\
	__asm__ volatile (\
		"ldr	x1,%[__i1]			\n\t							"\
		"ldr	x0,%[__i0]			\n\t	ldp	q4,q5,[x1]		\n\t"\
		"ldr	x2,%[__i2]			\n\t	ldp	q0,q1,[x0]		\n\t	ldr	x5,%[__j1]			\n\t"\
		"ldr	x3,%[__cc0]			\n\t	ldp	q6,q7,[x2]		\n\t	ldr	x4,%[__j0]			\n\t	ldp	q12,q13,[x5]		\n\t"\
		"fadd	v2.2d,v4.2d,v6.2d	\n\t	ldr	x1,%[__o1]		\n\t	ldr	x6,%[__j2]			\n\t	ldp	q8 ,q9 ,[x4]		\n\t"\
		"fadd	v3.2d,v5.2d,v7.2d	\n\t	ldr	x0,%[__o0]		\n\t									ldp	q14,q15,[x6]		\n\t"\
		"fsub	v4.2d,v4.2d,v6.2d	\n\t	ldr	x2,%[__o2]		\n\t	fadd	v10.2d,v12.2d,v14.2d\n\t	ldr	x5,%[__u1]		\n\t"\
		"fsub	v5.2d,v5.2d,v7.2d	\n\t ldp q6,q7,[x3]/* cc1 */\n\t	fadd	v11.2d,v13.2d,v15.2d\n\t	ldr	x4,%[__u0]		\n\t"\
		"fadd	v0.2d,v0.2d,v2.2d	\n\t fmul v2.2d,v2.2d,v6.2d	\n\t	fsub	v12.2d,v12.2d,v14.2d\n\t	ldr	x6,%[__u2]		\n\t"\
		"fadd	v1.2d,v1.2d,v3.2d	\n\t fmul v3.2d,v3.2d,v6.2d	\n\t	fsub	v13.2d,v13.2d,v15.2d\n\t"\
		"fmul	v4.2d,v4.2d,v7.2d	\n\t stp	q0,q1,[x0]		\n\t	fadd	v8.2d,v8.2d,v10.2d	\n\t fmul v10.2d,v10.2d,v6.2d	\n\t"\
		"fmul	v5.2d,v5.2d,v7.2d	\n\t fadd v2.2d,v2.2d,v0.2d	\n\t	fadd	v9.2d,v9.2d,v11.2d	\n\t fmul v11.2d,v11.2d,v6.2d	\n\t"\
		"fadd	v0.2d,v2.2d,v5.2d	\n\t fadd v3.2d,v3.2d,v1.2d	\n\t	fmul	v12.2d,v12.2d,v7.2d	\n\t stp	q8,q9,[x4]		\n\t"\
		"fsub	v1.2d,v3.2d,v4.2d	\n\t								fmul	v13.2d,v13.2d,v7.2d	\n\t fadd v10.2d,v10.2d,v8.2d	\n\t"\
		"fsub	v2.2d,v2.2d,v5.2d	\n\t								fadd	v8.2d,v10.2d,v13.2d	\n\t fadd v11.2d,v11.2d,v9.2d	\n\t"\
		"fadd	v3.2d,v3.2d,v4.2d	\n\t								fsub	v9.2d,v11.2d,v12.2d	\n\t"\
		"stp	q0,q1,[x2]			\n\t								fsub	v10.2d,v10.2d,v13.2d	\n\t"\
		"stp	q2,q3,[x1]			\n\t								fadd	v11.2d,v11.2d,v12.2d	\n\t"\
		"																stp	q8 ,q9 ,[x6]			\n\t"\
		"																stp	q10,q11,[x5]			\n\t"\
		:					/* outputs: none */\
		: [__cc0] "m" (Xcc0)	/* All inputs from memory addresses here */\
		 ,[__i0] "m" (Xi0)\
		 ,[__i1] "m" (Xi1)\
		 ,[__i2] "m" (Xi2)\
		 ,[__o0] "m" (Xo0)\
		 ,[__o1] "m" (Xo1)\
		 ,[__o2] "m" (Xo2)\
		 ,[__j0] "m" (Xj0)\
		 ,[__j1] "m" (Xj1)\
		 ,[__j2] "m" (Xj2)\
		 ,[__u0] "m" (Xu0)\
		 ,[__u1] "m" (Xu1)\
		 ,[__u2] "m" (Xu2)\
		: "cc","memory","x0","x1","x2","x3","v0","v1","v2","v3","v4","v5","v6","v7",\
			"x4","x5","x6","v8","v9","v10","v11","v12","v13","v14","v15"	/* Clobbered registers */\
	);\
	}

	/* SSE2_RADIX4_DIF_0TWIDDLE_STRIDE(add0-3, tmp, stride): radix-4 DIF DFT,
	   no twiddles. Reads 4 vector-complex inputs from the local store at
	   tmp + {0,1,2,3}*stride (stride is a 32-bit byte offset loaded into w4;
	   x1=tmp+stride, x2=tmp+2*stride, x3=tmp+3*stride), performs the radix-4
	   DIF butterfly, and scatters the 4 outputs to the main-array addresses
	   add0-3. Note the DIF-characteristic output twist: x2 gets the
	   [q10,q6] re/im mix and x3 gets [q7,q11]. */
	#define SSE2_RADIX4_DIF_0TWIDDLE_STRIDE(Xadd0, Xadd1, Xadd2, Xadd3, Xtmp, Xstride)\
	{\
	__asm__ volatile (\
		"ldr	w4,%[__stride]		\n\t"/* byte stride between local-store slots */\
		"ldr	x0,%[__tmp]			\n\t	add	x2,x0,x4,lsl #1	\n\t"/* x2 = tmp + 2*stride */\
		"ldp	q0,q1,[x0]			\n\t	add	x1,x0,x4		\n\t"/* x1 = tmp + stride */\
		"ldp	q8,q9,[x2]			\n\t	add	x3,x1,x4,lsl #1	\n\t"/* x3 = tmp + 3*stride */\
		"fsub	v4.2d,v0.2d,v8.2d	\n\t"/* t0 - t2 */\
		"fsub	v5.2d,v1.2d,v9.2d	\n\t"\
		"fadd	v0.2d,v0.2d,v8.2d	\n\t"/* t0 + t2 */\
		"fadd	v1.2d,v1.2d,v9.2d	\n\t"\
		"ldp	q2,q3,[x1]			\n\t"\
		"ldp	q8,q9,[x3]			\n\t"\
		"fsub	v6.2d,v2.2d,v8.2d	\n\t"/* t1 - t3 */\
		"fsub	v7.2d,v3.2d,v9.2d	\n\t"\
		"fadd	v2.2d,v2.2d,v8.2d	\n\t"/* t1 + t3 */\
		"fadd	v3.2d,v3.2d,v9.2d	\n\t"\
		/* Finish radix-4 butterfly and store results into main-array slots: */\
		"ldr	x0,%[__add0]		\n\t"\
		"ldr	x1,%[__add1]		\n\t"\
		"ldr	x2,%[__add2]		\n\t"\
		"ldr	x3,%[__add3]		\n\t"\
		"fsub	v8.2d ,v0.2d,v2.2d	\n\t"\
		"fsub	v9.2d ,v1.2d,v3.2d	\n\t"\
		"fsub	v10.2d,v4.2d,v7.2d	\n\t"\
		"fsub	v11.2d,v5.2d,v6.2d	\n\t"\
		"fadd	v2.2d ,v0.2d,v2.2d	\n\t"\
		"fadd	v3.2d ,v1.2d,v3.2d	\n\t"\
		"fadd	v7.2d ,v4.2d,v7.2d	\n\t"\
		"fadd	v6.2d ,v5.2d,v6.2d	\n\t"\
		"stp	q2 ,q3 ,[x0]		\n\t"\
		"stp	q8 ,q9 ,[x1]		\n\t"\
		"stp	q10,q6 ,[x2]		\n\t"\
		"stp	q7 ,q11,[x3]		\n\t"\
		:					/* outputs: none */\
		: [__add0] "m" (Xadd0)	/* All inputs from memory addresses here */\
		 ,[__add1] "m" (Xadd1)\
		 ,[__add2] "m" (Xadd2)\
		 ,[__add3] "m" (Xadd3)\
		 ,[__tmp] "m" (Xtmp)\
		 ,[__stride] "m" (Xstride)\
		: "cc","memory","x0","x1","x2","x3","x4","v0","v1","v2","v3","v4","v5","v6","v7","v8","v9","v10","v11"	/* Clobbered registers */\
	);\
	}

	/* DIF radix-4 subconvolution, sans twiddles, inputs in __i0-3, outputs in __o0-3, possibly coincident with inputs: */
	/* SSE2_RADIX4_DIF_0TWIDDLE_STRIDE_E(i0-3, o0-3): radix-4 DIF subconvolution,
	   no twiddles, with explicit per-leg input addresses i0-3 and output
	   addresses o0-3 instead of a base+stride scheme. Outputs may coincide with
	   inputs (output addresses are loaded only after all inputs are consumed).
	   Same butterfly and output twist as SSE2_RADIX4_DIF_0TWIDDLE_STRIDE.
	   NOTE(review): clobber list declares x4 though the template never uses it —
	   harmless over-declaration. */
	#define SSE2_RADIX4_DIF_0TWIDDLE_STRIDE_E(Xi0,Xi1,Xi2,Xi3, Xo0,Xo1,Xo2,Xo3)\
	{\
	__asm__ volatile (\
		"ldr	x0,%[__i0]		\n\t"\
		"ldr	x1,%[__i1]		\n\t"\
		"ldr	x2,%[__i2]		\n\t"\
		"ldr	x3,%[__i3]		\n\t"\
		"ldp	q0,q1,[x0]			\n\t"\
		"ldp	q8,q9,[x2]			\n\t"\
		"fsub	v4.2d,v0.2d,v8.2d	\n\t"/* t0 - t2 */\
		"fsub	v5.2d,v1.2d,v9.2d	\n\t"\
		"fadd	v0.2d,v0.2d,v8.2d	\n\t"/* t0 + t2 */\
		"fadd	v1.2d,v1.2d,v9.2d	\n\t"\
		"ldp	q2,q3,[x1]			\n\t"\
		"ldp	q8,q9,[x3]			\n\t"\
		"fsub	v6.2d,v2.2d,v8.2d	\n\t"/* t1 - t3 */\
		"fsub	v7.2d,v3.2d,v9.2d	\n\t"\
		"fadd	v2.2d,v2.2d,v8.2d	\n\t"/* t1 + t3 */\
		"fadd	v3.2d,v3.2d,v9.2d	\n\t"\
		/* Finish radix-4 butterfly and store results into main-array slots: */\
		"ldr	x0,%[__o0]		\n\t"\
		"ldr	x1,%[__o1]		\n\t"\
		"ldr	x2,%[__o2]		\n\t"\
		"ldr	x3,%[__o3]		\n\t"\
		"fsub	v8.2d ,v0.2d,v2.2d	\n\t"\
		"fsub	v9.2d ,v1.2d,v3.2d	\n\t"\
		"fsub	v10.2d,v4.2d,v7.2d	\n\t"\
		"fsub	v11.2d,v5.2d,v6.2d	\n\t"\
		"fadd	v2.2d ,v0.2d,v2.2d	\n\t"\
		"fadd	v3.2d ,v1.2d,v3.2d	\n\t"\
		"fadd	v7.2d ,v4.2d,v7.2d	\n\t"\
		"fadd	v6.2d ,v5.2d,v6.2d	\n\t"\
		"stp	q2 ,q3 ,[x0]		\n\t"\
		"stp	q8 ,q9 ,[x1]		\n\t"\
		"stp	q10,q6 ,[x2]		\n\t"\
		"stp	q7 ,q11,[x3]		\n\t"\
		:					/* outputs: none */\
		: [__i0] "m" (Xi0)	/* All inputs from memory addresses here */\
		 ,[__i1] "m" (Xi1)\
		 ,[__i2] "m" (Xi2)\
		 ,[__i3] "m" (Xi3)\
		 ,[__o0] "m" (Xo0)\
		 ,[__o1] "m" (Xo1)\
		 ,[__o2] "m" (Xo2)\
		 ,[__o3] "m" (Xo3)\
		: "cc","memory","x0","x1","x2","x3","x4","v0","v1","v2","v3","v4","v5","v6","v7","v8","v9","v10","v11"	/* Clobbered registers */\
	);\
	}

	/* SSE2_RADIX4_DIT_0TWIDDLE_STRIDE(add0-3, tmp, stride): radix-4 DIT DFT,
	   no twiddles — data-flow mirror of the DIF version: gathers 4 vector-
	   complex inputs from main-array addresses add0-3, performs the radix-4
	   DIT butterfly, and writes the 4 outputs to the local store at
	   tmp + {0,1,2,3}*stride (stride = byte offset in w4; x1=tmp+stride,
	   x2=tmp+2*stride, x3=tmp+3*stride). Note the DIT output ordering:
	   [q7,q11] to x1 and [q10,q6] to x3 (swapped vs the DIF macro). */
	#define SSE2_RADIX4_DIT_0TWIDDLE_STRIDE(Xadd0, Xadd1, Xadd2, Xadd3, Xtmp, Xstride)\
	{\
	__asm__ volatile (\
		"ldr	x0,%[__add0]		\n\t"\
		"ldr	x1,%[__add1]		\n\t"\
		"ldr	x2,%[__add2]		\n\t"\
		"ldr	x3,%[__add3]		\n\t"\
		"ldp	q0,q1,[x0]			\n\t"\
		"ldp	q8,q9,[x1]			\n\t"\
		"fsub	v2.2d,v0.2d,v8.2d	\n\t"/* t0 - t1 */\
		"fsub	v3.2d,v1.2d,v9.2d	\n\t"\
		"fadd	v0.2d,v0.2d,v8.2d	\n\t"/* t0 + t1 */\
		"fadd	v1.2d,v1.2d,v9.2d	\n\t"\
		"ldp	q4,q5,[x2]			\n\t"\
		"ldp	q8,q9,[x3]			\n\t"\
		"fsub	v6.2d,v4.2d,v8.2d	\n\t"/* t2 - t3 */\
		"fsub	v7.2d,v5.2d,v9.2d	\n\t"\
		"fadd	v4.2d,v4.2d,v8.2d	\n\t"/* t2 + t3 */\
		"fadd	v5.2d,v5.2d,v9.2d	\n\t"\
		/* Finish radix-4 butterfly and store results into main-array slots: */\
		"ldr	w4,%[__stride]		\n\t"/* byte stride between local-store slots */\
		"ldr	x0,%[__tmp]			\n\t	add	x2,x0,x4,lsl #1	\n\t"/* x2 = tmp + 2*stride */\
		"fsub	v8.2d ,v0.2d,v4.2d	\n\t	add	x1,x0,x4		\n\t"/* x1 = tmp + stride */\
		"fsub	v9.2d ,v1.2d,v5.2d	\n\t	add	x3,x1,x4,lsl #1	\n\t"/* x3 = tmp + 3*stride */\
		"fsub	v10.2d,v2.2d,v7.2d	\n\t"\
		"fsub	v11.2d,v3.2d,v6.2d	\n\t"\
		"fadd	v4.2d ,v0.2d,v4.2d	\n\t"\
		"fadd	v5.2d ,v1.2d,v5.2d	\n\t"\
		"fadd	v7.2d ,v2.2d,v7.2d	\n\t"\
		"fadd	v6.2d ,v3.2d,v6.2d	\n\t"\
		"stp	q4 ,q5 ,[x0]		\n\t"\
		"stp	q7 ,q11,[x1]		\n\t"\
		"stp	q8 ,q9 ,[x2]		\n\t"\
		"stp	q10,q6 ,[x3]		\n\t"\
		:					/* outputs: none */\
		: [__add0] "m" (Xadd0)	/* All inputs from memory addresses here */\
		 ,[__add1] "m" (Xadd1)\
		 ,[__add2] "m" (Xadd2)\
		 ,[__add3] "m" (Xadd3)\
		 ,[__tmp] "m" (Xtmp)\
		 ,[__stride] "m" (Xstride)\
		: "cc","memory","x0","x1","x2","x3","x4","v0","v1","v2","v3","v4","v5","v6","v7","v8","v9","v10","v11"	/* Clobbered registers */\
	);\
	}

	/* DIT radix-4 subconvolution, sans twiddles, inputs in __i0-3, outputs in __o0-3, possibly coincident with inputs: */
	/* SSE2_RADIX4_DIT_0TWIDDLE_STRIDE_E(i0-3, o0-3): radix-4 DIT subconvolution,
	   no twiddles, with explicit input addresses i0-3 and output addresses o0-3
	   (outputs may coincide with inputs — output addresses are loaded after all
	   inputs are consumed, interleaved with the final butterfly ops). Same
	   butterfly and output ordering as SSE2_RADIX4_DIT_0TWIDDLE_STRIDE.
	   NOTE(review): clobber list declares x4 though the template never uses it —
	   harmless over-declaration. */
	#define SSE2_RADIX4_DIT_0TWIDDLE_STRIDE_E(Xi0,Xi1,Xi2,Xi3, Xo0,Xo1,Xo2,Xo3)\
	{\
	__asm__ volatile (\
		"ldr	x0,%[__i0]		\n\t"\
		"ldr	x1,%[__i1]		\n\t"\
		"ldr	x2,%[__i2]		\n\t"\
		"ldr	x3,%[__i3]		\n\t"\
		"ldp	q0,q1,[x0]			\n\t"\
		"ldp	q8,q9,[x1]			\n\t"\
		"fsub	v2.2d,v0.2d,v8.2d	\n\t"/* t0 - t1 */\
		"fsub	v3.2d,v1.2d,v9.2d	\n\t"\
		"fadd	v0.2d,v0.2d,v8.2d	\n\t"/* t0 + t1 */\
		"fadd	v1.2d,v1.2d,v9.2d	\n\t"\
		"ldp	q4,q5,[x2]			\n\t"\
		"ldp	q8,q9,[x3]			\n\t"\
		"fsub	v6.2d,v4.2d,v8.2d	\n\t"/* t2 - t3 */\
		"fsub	v7.2d,v5.2d,v9.2d	\n\t"\
		"fadd	v4.2d,v4.2d,v8.2d	\n\t"/* t2 + t3 */\
		"fadd	v5.2d,v5.2d,v9.2d	\n\t"\
		/* Finish radix-4 butterfly and store results into main-array slots: */\
		"fsub	v8.2d ,v0.2d,v4.2d	\n\t	ldr	x0,%[__o0]		\n\t"\
		"fsub	v9.2d ,v1.2d,v5.2d	\n\t"\
		"fsub	v10.2d,v2.2d,v7.2d	\n\t	ldr	x1,%[__o1]		\n\t"\
		"fsub	v11.2d,v3.2d,v6.2d	\n\t"\
		"fadd	v4.2d ,v0.2d,v4.2d	\n\t	ldr	x2,%[__o2]		\n\t"\
		"fadd	v5.2d ,v1.2d,v5.2d	\n\t"\
		"fadd	v7.2d ,v2.2d,v7.2d	\n\t	ldr	x3,%[__o3]		\n\t"\
		"fadd	v6.2d ,v3.2d,v6.2d	\n\t"\
		"stp	q4 ,q5 ,[x0]		\n\t"\
		"stp	q7 ,q11,[x1]		\n\t"\
		"stp	q8 ,q9 ,[x2]		\n\t"\
		"stp	q10,q6 ,[x3]		\n\t"\
		:					/* outputs: none */\
		: [__i0] "m" (Xi0)	/* All inputs from memory addresses here */\
		 ,[__i1] "m" (Xi1)\
		 ,[__i2] "m" (Xi2)\
		 ,[__i3] "m" (Xi3)\
		 ,[__o0] "m" (Xo0)\
		 ,[__o1] "m" (Xo1)\
		 ,[__o2] "m" (Xo2)\
		 ,[__o3] "m" (Xo3)\
		: "cc","memory","x0","x1","x2","x3","x4","v0","v1","v2","v3","v4","v5","v6","v7","v8","v9","v10","v11"	/* Clobbered registers */\
	);\
	}

	/* SSE2_RADIX_04_DIF_3TWIDDLE_X2(in0-3,idiff, two,cc0,roff, out0-3,odiff):
	   two radix-4 DIF DFTs with 3 twiddles each, interleaved column-wise.
	   DFT #1 (left columns, x0-x3, v0-v11): inputs at in0-3, twiddles at cc0
	   (+0x20, +0x40 for legs 1,3), outputs to out0-3.
	   DFT #2 (right columns, x10-x13, v12-v23): inputs at inK + idiff (idiff =
	   variable-stored byte offset in w5), twiddles at cc0 + roff (roff =
	   variable-stored byte offset in w14), outputs to outK + odiff.
	   Each leg 1-3 is twiddle-multiplied (complex mul via fmul/fmls/fmla pairs)
	   before the 2 x 2 complex butterflies; leg 0 is untwiddled.
	   NOTE(review): the [__two] input operand is declared but never referenced
	   in the asm template, and x15 is clobber-listed but unused — both are
	   harmless but could be pruned; confirm against sibling-ISA versions. */
	#define SSE2_RADIX_04_DIF_3TWIDDLE_X2(Xin0,Xin1,Xin2,Xin3,Xidiff, Xtwo,Xcc0,Xroff, Xout0,Xout1,Xout2,Xout3,Xodiff)\
	{\
	__asm__ volatile (\
		"ldr	x4,%[__cc0]			\n\t	ldr	w14,%[__roff]		\n\t"\
		"ldr	x0,%[__in0]			\n\t"\
		"ldr	x1,%[__in1]			\n\t"\
		"ldr	x2,%[__in2]			\n\t"\
		"ldr	x3,%[__in3]			\n\t	add	x14, x4,x14			\n\t"/* x14 = cc0 + roff: DFT #2's twiddles */\
		"ldr	w5,%[__idiff]		\n\t	add	x10, x0,x5			\n\t"/* x10-13 = DFT #2's input addresses */\
		"ldp	q4,q5,[x2]			\n\t	add	x11, x1,x5			\n\t"\
		"ldp	q8,q9,[x4]			\n\t	add	x12, x2,x5			\n\t"\
		"ldp	q0,q1,[x0]			\n\t	add	x13, x3,x5			\n\t"\
		"fmul	v6.2d,v4.2d,v8.2d	\n\t"/* twiddle-mul of leg 2, DFT #1: */\
		"fmul	v7.2d,v5.2d,v8.2d	\n\t	ldp	q16,q17,[x12]			\n\t"\
		"fmls	v6.2d,v5.2d,v9.2d	\n\t	ldp	q20,q21,[x14]			\n\t"/* cc0 */\
		"fmla	v7.2d,v4.2d,v9.2d	\n\t	ldp	q12,q13,[x10]			\n\t"\
		"fsub	v2.2d ,v0.2d,v6.2d	\n\t fmul	v18.2d,v16.2d,v20.2d	\n\t"/* twiddle-mul: */\
		"fsub	v3.2d ,v1.2d,v7.2d	\n\t fmul	v19.2d,v17.2d,v20.2d	\n\t"\
		"fadd	v10.2d,v0.2d,v6.2d	\n\t fmls	v18.2d,v17.2d,v21.2d	\n\t"\
		"fadd	v11.2d,v1.2d,v7.2d	\n\t fmla	v19.2d,v16.2d,v21.2d	\n\t"\
		"ldp	q8,q9,[x4,#0x40]	\n\t fsub	v14.2d,v12.2d,v18.2d	\n\t"/* 2 x 2 complex butterfly: */\
		"ldp	q6,q7,[x3]			\n\t fsub	v15.2d,v13.2d,v19.2d	\n\t"\
		"fmul	v0.2d,v6.2d,v8.2d	\n\t fadd	v22.2d,v12.2d,v18.2d	\n\t"\
		"fmul	v1.2d,v7.2d,v8.2d	\n\t fadd	v23.2d,v13.2d,v19.2d	\n\t"\
		"fmls	v0.2d,v7.2d,v9.2d	\n\t	ldp	q20,q21,[x14,#0x40]		\n\t"/* cc0+4 */\
		"fmla	v1.2d,v6.2d,v9.2d	\n\t	ldp	q18,q19,[x13]			\n\t"\
		"ldp	q8,q9,[x4,#0x20]	\n\t fmul	v12.2d,v18.2d,v20.2d	\n\t"/* twiddle-mul: */\
		"ldp	q6,q7,[x1]			\n\t fmul	v13.2d,v19.2d,v20.2d	\n\t"\
		"fmul	v4.2d,v6.2d,v8.2d	\n\t fmls	v12.2d,v19.2d,v21.2d	\n\t"\
		"fmul	v5.2d,v7.2d,v8.2d	\n\t fmla	v13.2d,v18.2d,v21.2d	\n\t"\
		"fmls	v4.2d,v7.2d,v9.2d	\n\t	ldp	q20,q21,[x14,#0x20]		\n\t"/* cc0+2 */\
		"fmla	v5.2d,v6.2d,v9.2d	\n\t	ldp	q18,q19,[x11]			\n\t"\
		"fadd	v6.2d,v4.2d,v0.2d	\n\t fmul	v16.2d,v18.2d,v20.2d	\n\t"/* twiddle-mul: */\
		"fadd	v7.2d,v5.2d,v1.2d	\n\t fmul	v17.2d,v19.2d,v20.2d	\n\t"\
		"fsub	v4.2d,v4.2d,v0.2d	\n\t fmls	v16.2d,v19.2d,v21.2d	\n\t"\
		"fsub	v5.2d,v5.2d,v1.2d	\n\t fmla	v17.2d,v18.2d,v21.2d	\n\t"\
		"ldr	w5,%[__odiff]		\n\t fadd	v18.2d,v16.2d,v12.2d	\n\t"/* 2 x 2 complex butterfly: */\
		"ldr	x0,%[__out0]		\n\t fadd	v19.2d,v17.2d,v13.2d	\n\t"\
		"ldr	x1,%[__out1]		\n\t fsub	v16.2d,v16.2d,v12.2d	\n\t"\
		"ldr	x2,%[__out2]		\n\t fsub	v17.2d,v17.2d,v13.2d	\n\t"\
		"ldr	x3,%[__out3]		\n\t"\
		"fsub	v8.2d,v10.2d,v6.2d	\n\t	add	x10, x0,x5			\n\t"/* x10-13 = DFT #2's output addresses */\
		"fsub	v9.2d,v11.2d,v7.2d	\n\t	add	x11, x1,x5			\n\t"\
		"fsub	v1.2d,v3.2d,v4.2d	\n\t	add	x12, x2,x5			\n\t"\
		"fsub	v0.2d,v2.2d,v5.2d	\n\t	add	x13, x3,x5			\n\t"\
		"fadd	v6.2d,v6.2d,v10.2d	\n\t fsub	v20.2d,v22.2d,v18.2d	\n\t"\
		"fadd	v7.2d,v7.2d,v11.2d	\n\t fsub	v21.2d,v23.2d,v19.2d	\n\t"\
		"fadd	v4.2d,v4.2d,v3.2d	\n\t fsub	v13.2d,v15.2d,v16.2d	\n\t"\
		"fadd	v5.2d,v5.2d,v2.2d	\n\t fsub	v12.2d,v14.2d,v17.2d	\n\t"\
		"stp	q6,q7,[x0]			\n\t fadd	v18.2d,v18.2d,v22.2d	\n\t"\
		"stp	q0,q4,[x1]			\n\t fadd	v19.2d,v19.2d,v23.2d	\n\t"\
		"stp	q8,q9,[x2]			\n\t fadd	v16.2d,v16.2d,v15.2d	\n\t"\
		"stp	q5,q1,[x3]			\n\t fadd	v17.2d,v17.2d,v14.2d	\n\t"\
		"									stp	q18,q19,[x10]		\n\t"/* out 0 */\
		"									stp	q12,q16,[x11]		\n\t"/* out 1 */\
		"									stp	q20,q21,[x12]		\n\t"/* out 2 */\
		"									stp	q17,q13,[x13]		\n\t"/* out 3 */\
		:					/* outputs: none */\
		: [__in0] "m" (Xin0)	/* All inputs from memory addresses here */\
		 ,[__in1] "m" (Xin1)\
		 ,[__in2] "m" (Xin2)\
		 ,[__in3] "m" (Xin3)\
		/* idiff, 'input-address difference', has variable-stored bytewise address offset between ptrs to 1st,2nd DFT's inputs */\
		 ,[__idiff] "m" (Xidiff)\
		 ,[__two] "m" (Xtwo)	/* pointer to vector-const 2.0 [currently unreferenced in asm template] */\
		 ,[__cc0] "m" (Xcc0)\
		/* roff, 'roots-address offset', has literal-bytewise address offset between ptrs to 1st,2nd DFT's twiddles */\
		 ,[__roff] "m" (Xroff)\
		 ,[__out0] "m" (Xout0)\
		 ,[__out1] "m" (Xout1)\
		 ,[__out2] "m" (Xout2)\
		 ,[__out3] "m" (Xout3)\
		/* odiff, 'output-address difference', has variable-stored bytewise address offset between ptrs to 1st,2nd DFT's outputs */\
		 ,[__odiff] "m" (Xodiff)\
		: "cc","memory","x0","x1","x2","x3","x4","x5","v0","v1","v2","v3","v4","v5","v6","v7","v8","v9","v10","v11",\
		"x10","x11","x12","x13","x14","x15","v12","v13","v14","v15","v16","v17","v18","v19","v20","v21","v22","v23"	/* Clobbered registers */\
	);\
	}

	/* Radix-4 DIT with 3 twiddle-muls, processing two independent 4-DFTs side-by-side
	(left and right asm columns). DFT #1 reads inputs from in0-3, DFT #2 from in0-3 + idiff
	(a variable-stored byte offset); DFT #1's twiddles come from cc0, DFT #2's from cc0 + roff
	(a literal byte offset); results go to out0-3 and out0-3 + odiff respectively.
	Xtwo is unused in this ARM build but kept so the arglist matches other-SIMD builds. */
	#define SSE2_RADIX_04_DIT_3TWIDDLE_X2(Xin0,Xin1,Xin2,Xin3,Xidiff, Xtwo,Xcc0,Xroff, Xout0,Xout1,Xout2,Xout3,Xodiff)\
	{\
	__asm__ volatile (\
		"ldr	x4,%[__cc0]			\n\t	ldr	w5,%[__idiff]	\n\t"\
		"ldr	x0,%[__in0]			\n\t	add	x10, x0,x5		\n\t"\
		"ldr	x1,%[__in1]			\n\t	add	x11, x1,x5		\n\t"\
		"ldr	x2,%[__in2]			\n\t	add	x12, x2,x5		\n\t"\
		"ldr	x3,%[__in3]			\n\t	add	x13, x3,x5		\n\t"\
		"ldp	q0,q1,[x0]			\n\t"/* Ar,i0 */\
		"ldp	q2,q3,[x1]			\n\t"/* Ar,i1 */\
		"ldp	q4,q5,[x2]			\n\t"/* Ar,i2 */\
		"ldp	q6,q7,[x3]			\n\t"/* Ar,i3 */\
		"fsub	v8.2d, v0.2d,v2.2d	\n\t	ldp	q10,q11,[x10]		\n\t"/* Ar,i4 */\
		"fsub	v9.2d, v1.2d,v3.2d	\n\t	ldp	q12,q13,[x11]		\n\t"/* Ar,i5 */\
		"fsub	v20.2d,v4.2d,v6.2d	\n\t	ldp	q14,q15,[x12]		\n\t"/* Ar,i6 */\
		"fsub	v21.2d,v5.2d,v7.2d	\n\t	ldp	q16,q17,[x13]		\n\t"/* Ar,i7 */\
		"fadd	v2.2d,v0.2d,v2.2d	\n\t	fsub	v18.2d,v10.2d,v12.2d	\n\t"\
		"fadd	v3.2d,v1.2d,v3.2d	\n\t	fsub	v19.2d,v11.2d,v13.2d	\n\t"\
		"fadd	v6.2d,v4.2d,v6.2d	\n\t	fsub	v22.2d,v14.2d,v16.2d	\n\t"\
		"fadd	v7.2d,v5.2d,v7.2d	\n\t	fsub	v23.2d,v15.2d,v17.2d	\n\t"\
		"fsub	v4.2d,v2.2d,v6.2d	\n\t	fadd	v12.2d,v10.2d,v12.2d	\n\t"\
		"fsub	v5.2d,v3.2d,v7.2d	\n\t	fadd	v13.2d,v11.2d,v13.2d	\n\t"\
		"fsub	v0.2d,v8.2d,v21.2d	\n\t	fadd	v16.2d,v14.2d,v16.2d	\n\t"\
		"fsub	v1.2d,v9.2d,v20.2d	\n\t	fadd	v17.2d,v15.2d,v17.2d	\n\t"\
		"fadd	v6.2d,v2.2d,v6.2d	\n\t	fsub	v14.2d,v12.2d,v16.2d	\n\t"\
		"fadd	v7.2d,v3.2d,v7.2d	\n\t	fsub	v15.2d,v13.2d,v17.2d	\n\t"\
		"fadd	v3.2d,v8.2d,v21.2d	\n\t	fsub	v10.2d,v18.2d,v23.2d	\n\t"\
		"fadd	v2.2d,v9.2d,v20.2d	\n\t	fsub	v11.2d,v19.2d,v22.2d	\n\t"\
		"ldr	x0,%[__out0]		\n\t	fadd	v16.2d,v12.2d,v16.2d	\n\t"\
		"ldr	x1,%[__out1]		\n\t	fadd	v17.2d,v13.2d,v17.2d	\n\t"\
		"ldr	x2,%[__out2]		\n\t	fadd	v13.2d,v18.2d,v23.2d	\n\t"\
		"ldr	x3,%[__out3]		\n\t	fadd	v12.2d,v19.2d,v22.2d	\n\t"\
		"ldr	w14,%[__roff]	\n\t	add	x14, x4,x14	\n\t	ldr	w5,%[__odiff]	\n\t"/* BUGFIX: was %[__idiff] — x10-x13 formed below are 2nd-DFT *output* ptrs, so the output offset applies; previously %[__odiff] was declared but never used */\
		"stp	q6,q7,[x0]			\n\t	add	x10, x0,x5		\n\t"\
		"ldp	q8,q9,[x4]			\n\t	add	x11, x1,x5		\n\t"\
		"fmul	v6.2d,v4.2d,v8.2d	\n\t	add	x12, x2,x5		\n\t"\
		"fmul	v7.2d,v5.2d,v8.2d	\n\t	add	x13, x3,x5		\n\t"\
		"fmla	v6.2d,v5.2d,v9.2d	\n\t	stp	q16,q17,[x10]	\n\t"\
		"fmls	v7.2d,v4.2d,v9.2d	\n\t	ldp	q18,q19,[x14]	\n\t"\
		"stp	q6,q7,[x2]			\n\t	fmul	v16.2d,v14.2d,v18.2d	\n\t"\
		"ldp	q8,q9,[x4,#0x20]	\n\t	fmul	v17.2d,v15.2d,v18.2d	\n\t"\
		"fmul	v6.2d,v3.2d,v8.2d	\n\t	fmla	v16.2d,v15.2d,v19.2d	\n\t"\
		"fmul	v7.2d,v1.2d,v8.2d	\n\t	fmls	v17.2d,v14.2d,v19.2d	\n\t"\
		"fmla	v6.2d,v1.2d,v9.2d	\n\t	stp	q16,q17,[x12]		\n\t"\
		"fmls	v7.2d,v3.2d,v9.2d	\n\t	ldp	q18,q19,[x14,#0x20]	\n\t"\
		"stp	q6,q7,[x1]			\n\t	fmul	v16.2d,v13.2d,v18.2d	\n\t"\
		"ldp	q8,q9,[x4,#0x40]	\n\t	fmul	v17.2d,v11.2d,v18.2d	\n\t"\
		"fmul	v6.2d,v0.2d,v8.2d	\n\t	fmla	v16.2d,v11.2d,v19.2d	\n\t"\
		"fmul	v7.2d,v2.2d,v8.2d	\n\t	fmls	v17.2d,v13.2d,v19.2d	\n\t"\
		"fmla	v6.2d,v2.2d,v9.2d	\n\t	stp	q16,q17,[x11]		\n\t"\
		"fmls	v7.2d,v0.2d,v9.2d	\n\t	ldp	q18,q19,[x14,#0x40]	\n\t"\
		"stp	q6,q7,[x3]			\n\t	fmul	v16.2d,v10.2d,v18.2d	\n\t"\
		"									fmul	v17.2d,v12.2d,v18.2d	\n\t"\
		"									fmla	v16.2d,v12.2d,v19.2d	\n\t"\
		"									fmls	v17.2d,v10.2d,v19.2d	\n\t"\
		"									stp	q16,q17,[x13]		\n\t"\
		:					/* outputs: none */\
		: [__in0] "m" (Xin0)	/* All inputs from memory addresses here */\
		 ,[__in1] "m" (Xin1)\
		 ,[__in2] "m" (Xin2)\
		 ,[__in3] "m" (Xin3)\
		/* idiff, 'input-address difference', has variable-stored bytewise address offset between ptrs to 1st,2nd DFT's inputs */\
		 ,[__idiff] "m" (Xidiff)\
		 ,[__two] "m" (Xtwo)	/* pointer to vector-const 2.0 */\
		 ,[__cc0] "m" (Xcc0)\
		/* roff, 'roots-address offset', has literal-bytewise address offset between ptrs to 1st,2nd DFT's twiddles */\
		 ,[__roff] "m" (Xroff)\
		 ,[__out0] "m" (Xout0)\
		 ,[__out1] "m" (Xout1)\
		 ,[__out2] "m" (Xout2)\
		 ,[__out3] "m" (Xout3)\
		/* odiff, 'output-address difference', has variable-stored bytewise address offset between ptrs to 1st,2nd DFT's outputs */\
		 ,[__odiff] "m" (Xodiff)\
		: "cc","memory","x0","x1","x2","x3","x4","x5","v0","v1","v2","v3","v4","v5","v6","v7","v8","v9","v10","v11",\
		"x10","x11","x12","x13","x14","x15","v12","v13","v14","v15","v16","v17","v18","v19","v20","v21","v22","v23"	/* Clobbered registers */\
	);\
	}

	/* Single radix-4 DIT with 3 twiddle-muls: 4 complex inputs from in0-3; outputs, twiddled by
	~w2,~w1,~w3 read from cc0 + 0x00,0x20,0x40 resp. (per the inline comments below), written to out0-3.
	NOTE(review): the __idiff/__two/__roff/__odiff operands are declared but not referenced in this
	asm body — presumably kept so the arglist matches the _X2 variant above; confirm with callers. */
	#define SSE2_RADIX_04_DIT_3TWIDDLE_X1(Xin0,Xin1,Xin2,Xin3,Xidiff, Xtwo,Xcc0,Xroff, Xout0,Xout1,Xout2,Xout3,Xodiff)\
	{\
	__asm__ volatile (\
		"ldr	x4,%[__cc0]			\n\t"\
		"ldr	x0,%[__in0]			\n\t	ldp	q0,q1,[x0]		\n\t"/* Ar,i0 */\
		"ldr	x1,%[__in1]			\n\t	ldp	q2,q3,[x1] 		\n\t"/* Ar,i1 */\
		"ldr	x2,%[__in2]			\n\t	ldp	q4,q5,[x2]		\n\t"/* Ar,i2 */\
		"ldr	x3,%[__in3]			\n\t	ldp	q6,q7,[x3]		\n\t"/* Ar,i3 */\
		"fsub	v8.2d, v0.2d,v2.2d	\n\t"/* tr1 = Ar0 - Ar1 */\
		"fsub	v9.2d, v1.2d,v3.2d	\n\t"/* ti1 = Ai0 - Ai1 */\
		"fsub	v10.2d,v4.2d,v6.2d	\n\t"/* tr3 = Ar2 - Ar3 */\
		"fsub	v11.2d,v5.2d,v7.2d	\n\t"/* ti3 = Ai2 - Ai3 */\
		"fadd	v2.2d,v0.2d,v2.2d	\n\t"/* tr0 = Ar0 + Ar1 */\
		"fadd	v3.2d,v1.2d,v3.2d	\n\t"/* ti0 = Ai0 + Ai1 */\
		"fadd	v6.2d,v4.2d,v6.2d	\n\t"/* tr2 = Ar2 + Ar3 */\
		"fadd	v7.2d,v5.2d,v7.2d	\n\t"/* ti2 = Ai2 + Ai3 */\
	/*	Br0 = tr0 + tr2;			Bi0 = ti0 + ti2;
		tr0 = tr0 - tr2;			ti0 = ti0 - ti2;
		r = tr3;tr3 = tr1 - ti3;	tr1 = tr1 + ti3;
				ti3 = ti1 + r  ;	ti1 = ti1 - r  ;	*/\
		"fsub	v4.2d,v2.2d,v6.2d	\n\t"/* tr0 = tr0 - tr2 */\
		"fsub	v5.2d,v3.2d,v7.2d	\n\t"/* ti0 = ti0 - ti2 */\
		"fsub	v0.2d,v8.2d,v11.2d	\n\t"/* tr3 = tr1 - ti3 */\
		"fsub	v1.2d,v9.2d,v10.2d	\n\t"/* ti1 = ti1 - tr3 */\
		"fadd	v6.2d,v2.2d,v6.2d	\n\t"/* Br0 = tr0 + tr2 */\
		"fadd	v7.2d,v3.2d,v7.2d	\n\t"/* Bi0 = ti0 + ti2 */\
		"fadd	v3.2d,v8.2d,v11.2d	\n\t"/* tr1 = tr1 + ti3 */\
		"fadd	v2.2d,v9.2d,v10.2d	\n\t"/* ti3 = ti1 + tr3 */\
		"ldr	x0,%[__out0]		\n\t"\
		"ldr	x1,%[__out1]		\n\t"\
		"ldr	x2,%[__out2]		\n\t"\
		"ldr	x3,%[__out3]		\n\t"\
		"stp	q6,q7,[x0]			\n\t"/* out 0: the DC term needs no twiddle */\
	/* Br2 = tr0*c2 + ti0*s2;	Bi2 = ti0*c2 - tr0*s2;	// twiddle = ~w2 = c2-I.s1 */\
		"ldp	q8,q9,[x4]			\n\t"/* c2 */\
		"fmul	v6.2d,v4.2d,v8.2d	\n\t"/* twiddle-mul: tr,i0 in v4,5 */\
		"fmul	v7.2d,v5.2d,v8.2d	\n\t"\
		"fmla	v6.2d,v5.2d,v9.2d	\n\t"\
		"fmls	v7.2d,v4.2d,v9.2d	\n\t"\
		"stp	q6,q7,[x2]			\n\t"/* out 2 */\
	/* Br1 = tr1*c1 + ti1*s1;	Bi1 = ti1*c1 - tr1*s1;	// twiddle = ~w1 = c1-I.s1 */\
		"ldp	q8,q9,[x4,#0x20]	\n\t"/* c1 */\
		"fmul	v6.2d,v3.2d,v8.2d	\n\t"/* twiddle-mul: tr,i1 in v3,1 */\
		"fmul	v7.2d,v1.2d,v8.2d	\n\t"\
		"fmla	v6.2d,v1.2d,v9.2d	\n\t"\
		"fmls	v7.2d,v3.2d,v9.2d	\n\t"\
		"stp	q6,q7,[x1]			\n\t"/* out 1 */\
	/* Br3 = tr3*c3 + ti3*s3;	Bi3 = ti3*c3 - tr3*s3;	// twiddle = ~w3 = c3-I.s3 */\
		"ldp	q8,q9,[x4,#0x40]	\n\t"/* c3 */\
		"fmul	v6.2d,v0.2d,v8.2d	\n\t"/* twiddle-mul: tr,i3 in v0,2 */\
		"fmul	v7.2d,v2.2d,v8.2d	\n\t"\
		"fmla	v6.2d,v2.2d,v9.2d	\n\t"\
		"fmls	v7.2d,v0.2d,v9.2d	\n\t"\
		"stp	q6,q7,[x3]			\n\t"/* out 3 */\
		:					/* outputs: none */\
		: [__in0] "m" (Xin0)	/* All inputs from memory addresses here */\
		 ,[__in1] "m" (Xin1)\
		 ,[__in2] "m" (Xin2)\
		 ,[__in3] "m" (Xin3)\
		/* idiff, 'input-address difference', has variable-stored bytewise address offset between ptrs to 1st,2nd DFT's inputs */\
		 ,[__idiff] "m" (Xidiff)\
		 ,[__two] "m" (Xtwo)	/* pointer to vector-const 2.0 */\
		 ,[__cc0] "m" (Xcc0)\
		/* roff, 'roots-address offset', has literal-bytewise address offset between ptrs to 1st,2nd DFT's twiddles */\
		 ,[__roff] "m" (Xroff)\
		 ,[__out0] "m" (Xout0)\
		 ,[__out1] "m" (Xout1)\
		 ,[__out2] "m" (Xout2)\
		 ,[__out3] "m" (Xout3)\
		/* odiff, 'output-address difference', has variable-stored bytewise address offset between ptrs to 1st,2nd DFT's outputs */\
		 ,[__odiff] "m" (Xodiff)\
		: "cc","memory","x0","x1","x2","x3","x4", "v0","v1","v2","v3","v4","v5","v6","v7","v8","v9","v10","v11"	/* Clobbered registers */\
	);\
	}

	/* Radix-5 DFT, no twiddles: 5 complex inputs i0-i4 -> 5 complex outputs o0-o4.
	Trig constants are read from cc1: a q-pair at [x6], then (after x6 += 0x20) one
	ld1r-broadcast scalar and a further q-pair at +0x10 — exact layout is defined by
	the caller's sincos table; TODO(review): confirm cc1 layout against callers. */
	#define SSE2_RADIX_05_DFT_0TWIDDLE(Xi0,Xi1,Xi2,Xi3,Xi4, Xcc1, Xo0,Xo1,Xo2,Xo3,Xo4)\
	{\
	__asm__ volatile (\
		"ldr	x0,%[__i0]		\n\t"\
		"ldr	x1,%[__i1]		\n\t"\
		"ldr	x2,%[__i2]		\n\t"\
		"ldr	x3,%[__i3]		\n\t"\
		"ldr	x4,%[__i4]		\n\t"\
		"ldr	x5,%[__o0]		\n\t"\
		"ldp	q0,q1,[x1]			\n\t"\
		"ldp	q6,q7,[x4]			\n\t"\
/*6*/	"fadd	v8.2d,v0.2d,v6.2d	\n\t"/* x1 + x4 */\
/*7*/	"fadd	v9.2d,v1.2d,v7.2d	\n\t"\
		"fsub	v0.2d,v0.2d,v6.2d	\n\t"/* x1 - x4 */\
		"fsub	v1.2d,v1.2d,v7.2d	\n\t"\
		"ldp	q2,q3,[x2]			\n\t"\
		"ldp	q6,q7,[x3]			\n\t"\
		"fadd	v4.2d,v2.2d,v6.2d	\n\t"/* x2 + x3 */\
		"fadd	v5.2d,v3.2d,v7.2d	\n\t"\
		"fsub	v2.2d,v2.2d,v6.2d	\n\t"/* x2 - x3 */\
		"fsub	v3.2d,v3.2d,v7.2d	\n\t"\
		"ldr	x6,%[__cc1]		\n\t"\
		"fsub	v6.2d,v8.2d,v4.2d	\n\t"\
		"fsub	v7.2d,v9.2d,v5.2d	\n\t"\
		"fadd	v4.2d,v8.2d,v4.2d	\n\t"\
		"fadd	v5.2d,v9.2d,v5.2d	\n\t"\
		"ldp	q10,q11,[x6]	\n\t	add x6,x6,#0x20	\n\t"/* 1st pair of trig consts; advance x6 to next group */\
		"ldp	q8,q9,[x0]			\n\t"/* x0 */\
		"fadd	v8.2d,v4.2d,v8.2d	\n\t"/* DC output = x0 + (cosine-term sums) */\
		"fadd	v9.2d,v5.2d,v9.2d	\n\t"\
		"fmul	v6.2d,v6.2d,v11.2d	\n\t"\
		"fmul	v7.2d,v7.2d,v11.2d	\n\t"\
		"stp	q8,q9,[x5]			\n\t"/* o0 */\
/*4*/	"fmla	v8.2d,v4.2d,v10.2d	\n\t"\
/*5*/	"fmla	v9.2d,v5.2d,v10.2d	\n\t"\
		"fsub	v4.2d,v8.2d,v6.2d	\n\t"\
		"fsub	v5.2d,v9.2d,v7.2d	\n\t"\
		"fadd	v6.2d,v8.2d,v6.2d	\n\t"\
		"fadd	v7.2d,v9.2d,v7.2d	\n\t"\
		"ld1r	{v10.2d},[x6]		\n\t"/* broadcast next trig const to both lanes */\
		"ldp	q11,q9,[x6,0x10]	\n\t"\
		"fmul	v8.2d,v0.2d,v9.2d	\n\t"/* sine-term combos of the x1-x4, x2-x3 differences: */\
		"fmul	v9.2d,v1.2d,v9.2d	\n\t"\
		"fsub	v0.2d,v0.2d,v2.2d	\n\t"\
		"fsub	v1.2d,v1.2d,v3.2d	\n\t"\
		"fmul	v0.2d,v0.2d,v10.2d	\n\t"\
		"fmul	v1.2d,v1.2d,v10.2d	\n\t"\
		"fmul	v2.2d,v2.2d,v11.2d	\n\t"\
		"fmul	v3.2d,v3.2d,v11.2d	\n\t"\
		"fadd	v2.2d,v2.2d,v0.2d	\n\t"\
		"fadd	v3.2d,v3.2d,v1.2d	\n\t"\
		"fsub	v0.2d,v0.2d,v8.2d	\n\t"\
		"fsub	v1.2d,v1.2d,v9.2d	\n\t"\
		"ldr	x1,%[__o1]		\n\t"\
		"ldr	x4,%[__o4]		\n\t"\
/*6*/	"fsub	v8.2d,v6.2d,v3.2d	\n\t"/* final +- of cos and sin terms gives the o1/o4 pair: */\
/*7*/	"fsub	v9.2d,v7.2d,v2.2d	\n\t"\
		"fadd	v3.2d,v6.2d,v3.2d	\n\t"\
		"fadd	v2.2d,v7.2d,v2.2d	\n\t"\
		"stp	q8,q2,[x1]			\n\t"/* o1 */\
		"stp	q3,q9,[x4]			\n\t"/* o4 */\
		"ldr	x2,%[__o2]		\n\t"\
		"ldr	x3,%[__o3]		\n\t"\
/*4*/	"fsub	v8.2d,v4.2d,v1.2d	\n\t"/* ...and likewise for the o2/o3 pair: */\
/*5*/	"fsub	v9.2d,v5.2d,v0.2d	\n\t"\
		"fadd	v1.2d,v4.2d,v1.2d	\n\t"\
		"fadd	v0.2d,v5.2d,v0.2d	\n\t"\
		"stp	q8,q0,[x2]			\n\t"/* o2 */\
		"stp	q1,q9,[x3]			\n\t"/* o3 */\
		:					/* outputs: none */\
		: [__i0] "m" (Xi0)	/* All inputs from memory addresses here */\
		 ,[__i1] "m" (Xi1)\
		 ,[__i2] "m" (Xi2)\
		 ,[__i3] "m" (Xi3)\
		 ,[__i4] "m" (Xi4)\
		 ,[__cc1] "m" (Xcc1)\
		 ,[__o0] "m" (Xo0)\
		 ,[__o1] "m" (Xo1)\
		 ,[__o2] "m" (Xo2)\
		 ,[__o3] "m" (Xo3)\
		 ,[__o4] "m" (Xo4)\
		: "cc","memory","x0","x1","x2","x3","x4","x5","x6","v0","v1","v2","v3","v4","v5","v6","v7","v8","v9","v10","v11"	/* Clobbered registers */\
	);\
	}

	/* Does 2 of the above 5-DFTs side-by-side — in this ARM build, simply two sequential calls.
	Xtwo is not referenced here; presumably kept so the arglist matches other-SIMD builds of this
	macro — TODO(review): confirm against the x86 versions. */
	#define SSE2_RADIX_05_DFT_0TWIDDLE_X2(Xcc1,Xtwo, Xi0,Xi1,Xi2,Xi3,Xi4, Xo0,Xo1,Xo2,Xo3,Xo4, Xj0,Xj1,Xj2,Xj3,Xj4, Xu0,Xu1,Xu2,Xu3,Xu4)\
	{\
		SSE2_RADIX_05_DFT_0TWIDDLE(Xi0,Xi1,Xi2,Xi3,Xi4, Xcc1, Xo0,Xo1,Xo2,Xo3,Xo4);\
		SSE2_RADIX_05_DFT_0TWIDDLE(Xj0,Xj1,Xj2,Xj3,Xj4, Xcc1, Xu0,Xu1,Xu2,Xu3,Xu4);\
	}

	#define SSE2_RADIX_07_DFT(Xi0,Xi1,Xi2,Xi3,Xi4,Xi5,Xi6, Xcc, Xo0,Xo1,Xo2,Xo3,Xo4,Xo5,Xo6)\
	{\
	__asm__ volatile (		/*** Rcol does Imaginary Parts: ***/\
	/* Load what are vec-regs 4-6,12-14 in AVX2 version of 7-DFT macro into temp-regs v0,7,8,15,16,17 here:*/\
		"ldr	x1,%[__i1]		\n\t	ldp	q8,q17,[x1]	\n\t"/* x1 */\
		"ldr	x6,%[__i6]		\n\t	ldp	q1,q9 ,[x6]	\n\t"/* x6 */\
		"ldr	x2,%[__i2]		\n\t	ldp	q7,q16,[x2]	\n\t"/* x2 */\
		"ldr	x5,%[__i5]		\n\t	ldp	q2,q10,[x5]	\n\t"/* x5 */\
		"ldr	x3,%[__i3]		\n\t	ldp	q0,q15,[x3]	\n\t"/* x3 */\
		"ldr	x4,%[__i4]		\n\t	ldp	q3,q11,[x4]	\n\t"/* x4 */\
		"ldr	x0,%[__i0]		\n\t"\
		"fsub	v6.2d,v8.2d,v1.2d	\n\t	fsub	v14.2d,v17.2d,v9.2d 		\n\t"	/* t6 = x1 - x6 */\
		"fsub	v5.2d,v7.2d,v2.2d	\n\t	fsub	v13.2d,v16.2d,v10.2d		\n\t"	/* t5 = x2 - x5 */\
		"fsub	v4.2d,v0.2d,v3.2d	\n\t	fsub	v12.2d,v15.2d,v11.2d		\n\t"	/* t4 = x3 - x4 */\
		"fadd	v1.2d,v8.2d,v1.2d	\n\t	fadd	v9.2d ,v17.2d,v9.2d 		\n\t"	/* t1 = x1 + x6 */\
		"fadd	v2.2d,v7.2d,v2.2d	\n\t	fadd	v10.2d,v16.2d,v10.2d		\n\t"	/* t2 = x2 + x5 */\
		"fadd	v3.2d,v0.2d,v3.2d	\n\t	fadd	v11.2d,v15.2d,v11.2d		\n\t"	/* t3 = x3 + x4 */\
		"ldp	q0,q8,[x0] 	\n\t"	/* t0 = x0 */\
	/* Spill  xi - xj combos to o-slots; these won't be needed until we get to the sine terms: */\
		"ldr	x1,%[__o1]		\n\t	stp	q6,q14,[x1]		\n\t"/* t6 */\
		"ldr	x2,%[__o2]		\n\t	stp	q5,q13,[x2]		\n\t"/* t5 */\
		"ldr	x3,%[__o3]		\n\t	stp	q4,q12,[x3]		\n\t"/* t4 */\
		"mov	v6.16b,v0.16b	\n\t	mov	v14.16b,v8.16b	\n\t"/* Br0 = t0 [only show real parts in comments] */\
		"mov	v5.16b,v0.16b	\n\t	mov	v13.16b,v8.16b	\n\t"/* rt  = t0 */\
		"mov	v4.16b,v0.16b	\n\t	mov	v12.16b,v8.16b	\n\t"/* re  = t0 */\
		"ldr	x5,%[__cc]		\n\t"\
		"ldr	x0,%[__o0]		\n\t"\
		"ldp q16,q17,[x5      ]	\n\t"/* cc1,ss1 */\
		"ldp q7 ,q18,[x5,#0x20]	\n\t	ldp q15,q19,[x5,#0x40]	\n\t"/* cc2,cc3 ... also preload ss2,ss3 into q18,19 */\
		"fmla	v5.2d,v1.2d,v16.2d	\n\t	fmla	v13.2d,v9.2d ,v16.2d		\n\t"/* rt  = FMADD[cc1,tr1, rt ]; */\
		"fmla	v4.2d,v1.2d,v7.2d 	\n\t	fmla	v12.2d,v9.2d ,v7.2d 		\n\t"/* re  = FMADD[cc2,tr1, re ]; */\
		"fmla	v0.2d,v1.2d,v15.2d	\n\t	fmla	v8.2d ,v9.2d ,v15.2d		\n\t"/* tr0 = FMADD[cc3,tr1, tr0]; */\
		"fadd	v6.2d,v6.2d,v1.2d 	\n\t	fadd	v14.2d,v14.2d,v9.2d 		\n\t"/* Br0 += tr1; */\
	\
		"fmla	v5.2d,v2.2d,v7.2d 	\n\t	fmla	v13.2d,v10.2d,v7.2d 		\n\t"/* rt  = FMADD[cc2,tr2, rt ]; */\
		"fmla	v4.2d,v2.2d,v15.2d	\n\t	fmla	v12.2d,v10.2d,v15.2d		\n\t"/* re  = FMADD[cc3,tr2, re ]; */\
		"fmla	v0.2d,v2.2d,v16.2d	\n\t	fmla	v8.2d ,v10.2d,v16.2d		\n\t"/* tr0 = FMADD[cc1,tr2, tr0]; */\
		"fadd	v6.2d,v6.2d,v2.2d 	\n\t	fadd	v14.2d,v14.2d,v10.2d		\n\t"/* Br0 += tr2; */\
	\
		"fmla	v5.2d,v3.2d,v15.2d	\n\t	fmla	v13.2d,v11.2d,v15.2d		\n\t"/* rt  = FMADD[cc3,tr3, rt ]; */\
		"fmla	v4.2d,v3.2d,v16.2d	\n\t	fmla	v12.2d,v11.2d,v16.2d		\n\t"/* re  = FMADD[cc1,tr3, re ]; */\
		"fmla	v0.2d,v3.2d,v7.2d 	\n\t	fmla	v8.2d ,v11.2d,v7.2d 		\n\t"/* tr0 = FMADD[cc2,tr3, tr0]; */\
		"fadd	v6.2d,v6.2d,v3.2d 	\n\t	fadd	v14.2d,v11.2d,v14.2d		\n\t"/* Br0 += tr3; */\
		"stp	q6,q14,[x0]	\n\t"/* B0 */\
	\
		"mov	v7.16b,v18.16b	\n\t	mov	v15.16b,v19.16b		\n\t"/* ss2,ss3 */\
		"ldp	q1,q9 ,[x1] 		\n\t"/* Restore: tr1 = tr6 */\
		"ldp	q2,q10,[x1] 		\n\t"/* tr2 = tr6 */\
		"ldp	q3,q11,[x1] 		\n\t"/* tr3 = tr6 */\
	\
		"fmul	v1.2d,v1.2d,v17.2d	\n\t	fmul	v9.2d ,v9.2d ,v17.2d	\n\t"/* tr1 = ss1*tr6; */\
		"fmul	v2.2d,v2.2d,v7.2d 	\n\t	fmul	v10.2d,v10.2d,v7.2d 	\n\t"/* tr2 = ss2*tr6; */\
		"fmul	v3.2d,v3.2d,v15.2d	\n\t	fmul	v11.2d,v11.2d,v15.2d	\n\t"/* tr3 = ss3*tr6; */\
		"ldp	q6,q14,[x2]			\n\t"/* Restore t5 */\
		"fmla	v1.2d,v6.2d,v7.2d 	\n\t	fmla	v9.2d ,v14.2d,v7.2d 	\n\t"/* tr1 =  FMADD[ss2,tr5, tr1]; */\
		"fmls	v2.2d,v6.2d,v15.2d	\n\t	fmls	v10.2d,v14.2d,v15.2d	\n\t"/* tr2 = FNMADD[ss3,tr5, tr2]; */\
		"fmls	v3.2d,v6.2d,v17.2d	\n\t	fmls	v11.2d,v14.2d,v17.2d	\n\t"/* tr3 = FNMADD[ss1,tr5, tr3]; */\
		"ldp	q6,q14,[x3]			\n\t"/* Restore t4 */\
		"fmla	v1.2d,v6.2d,v15.2d	\n\t	fmla	v9.2d ,v14.2d,v15.2d	\n\t"/* tr1 =  FMADD[ss3,tr4, tr1]; */\
		"fmls	v2.2d,v6.2d,v17.2d	\n\t	fmls	v10.2d,v14.2d,v17.2d	\n\t"/* tr2 = FNMADD[ss1,tr4, tr2]; */\
		"fmla	v3.2d,v6.2d,v7.2d 	\n\t	fmla	v11.2d,v14.2d,v7.2d 	\n\t"/* tr3 =  FMADD[ss2,tr4, tr3]; */\
	\
		"ldr	x4,%[__o4]		\n\t"\
		"ldr	x5,%[__o5]		\n\t"\
		"ldr	x6,%[__o6]		\n\t"\
	/* Output permutation causes signs to get flipped here; write what are vec-regs 0,4,5,8,12,13 in AVX2 mode to temp-regs 6,7,14,15,16,17 here: */\
		"fsub	v14.2d,v5.2d,v9.2d 	\n\t	fsub	v17.2d,v13.2d,v1.2d	\n\t"/* Br1 = rt  - ti1;	Bi6 = it  - tr1; */\
		"fsub	v7.2d ,v4.2d,v10.2d	\n\t	fsub	v16.2d,v12.2d,v2.2d	\n\t"/* Br2 = re  - ti2;	Bi5 = im  - tr2; */\
		"fsub	v6.2d ,v0.2d,v11.2d	\n\t	fsub	v15.2d,v8.2d ,v3.2d	\n\t"/* Br3 = tr0 - ti3;	Bi4 = ti0 - tr3; */\
		"fadd	v9.2d ,v5.2d,v9.2d 	\n\t	fadd	v1.2d ,v13.2d,v1.2d	\n\t"/* Br6 = rt  + ti1;	Bi1 = it  + tr1; */\
		"fadd	v10.2d,v4.2d,v10.2d	\n\t	fadd	v2.2d ,v12.2d,v2.2d	\n\t"/* Br5 = re  + ti2;	Bi2 = im  + tr2; */\
		"fadd	v11.2d,v0.2d,v11.2d	\n\t	fadd	v3.2d ,v8.2d ,v3.2d	\n\t"/* Br4 = tr0 + ti3;	Bi3 = ti0 + tr3; */\
		"stp	q14,q1 ,[x1]	\n\t"/* B1 */\
		"stp	q7 ,q2 ,[x2]	\n\t"/* B2 */\
		"stp	q6 ,q3 ,[x3]	\n\t"/* B3 */\
		"stp	q11,q15,[x4]	\n\t"/* B4 */\
		"stp	q10,q16,[x5]	\n\t"/* B5 */\
		"stp	q9 ,q17,[x6]	\n\t"/* B6 */\
		:					/* outputs: none */\
		: [__i0] "m" (Xi0)	/* All inputs from memory addresses here */\
		 ,[__i1] "m" (Xi1)\
		 ,[__i2] "m" (Xi2)\
		 ,[__i3] "m" (Xi3)\
		 ,[__i4] "m" (Xi4)\
		 ,[__i5] "m" (Xi5)\
		 ,[__i6] "m" (Xi6)\
		 ,[__cc] "m" (Xcc)\
		 ,[__o0] "m" (Xo0)\
		 ,[__o1] "m" (Xo1)\
		 ,[__o2] "m" (Xo2)\
		 ,[__o3] "m" (Xo3)\
		 ,[__o4] "m" (Xo4)\
		 ,[__o5] "m" (Xo5)\
		 ,[__o6] "m" (Xo6)\
		: "cc","memory","x0","x1","x2","x3","x4","x5","x6",\
		"v0","v1","v2","v3","v4","v5","v6","v7","v8","v9","v10","v11","v12","v13","v14","v15","v16","v17","v18","v19"	/* Clobbered registers */\
	);\
	}

	/* Twiddleless version of SSE2_RADIX8_DIF_TWIDDLE. Inputs enter in memory locations __r0 + [__i1,__i2,__i3,__i4,__i5,__i6,__i7];
	where r0 is a memory address and the i's are LITERAL [BYTE] OFFSETS. Outputs go into memory locations __o0,__o1,__o2,__o3,__o4,__o5,__o6,__o7, assumed disjoint with inputs.
	*/
	#define SSE2_RADIX8_DIF_0TWIDDLE(Xr0, Xi1,Xi2,Xi3,Xi4,Xi5,Xi6,Xi7, Xo0,Xo1,Xo2,Xo3,Xo4,Xo5,Xo6,Xo7, Xisrt2)\
	{\
	__asm__ volatile (\
		"ldr	x0,%[__r0]			\n\t"\
		"ldr	w1,%[__i2]			\n\t	add	x1,x0,x1			\n\t"\
		"ldr	w2,%[__i4]			\n\t	add	x2,x0,x2			\n\t"\
		"ldr	w3,%[__i6]			\n\t	add	x3,x0,x3			\n\t"\
		"ldr	w4,%[__i1]			\n\t	add	x4,x0,x4			\n\t"\
		"ldr	w5,%[__i3]			\n\t	add	x5,x0,x5			\n\t"\
		"ldr	w6,%[__i5]			\n\t	add	x6,x0,x6			\n\t"\
		"ldr	w7,%[__i7]			\n\t	add	x7,x0,x7			\n\t"\
		/*...Block 1: */\
		"ldp	q0,q1,[x0]			\n\t	ldp	q12,q13,[x4]				\n\t"\
		"ldp	q4,q5,[x2]			\n\t	ldp	q16,q17,[x6]				\n\t"\
		"fsub	v2.2d,v0.2d,v4.2d	\n\t	fsub	v14.2d,v12.2d,v16.2d	\n\t"\
		"fsub	v3.2d,v1.2d,v5.2d	\n\t	fsub	v15.2d,v13.2d,v17.2d	\n\t"\
		"fadd	v0.2d,v0.2d,v4.2d	\n\t	fadd	v12.2d,v12.2d,v16.2d	\n\t"\
		"fadd	v1.2d,v1.2d,v5.2d	\n\t	fadd	v13.2d,v13.2d,v17.2d	\n\t"\
		"ldp	q4,q5,[x1]			\n\t	ldp	q16,q17,[x5]				\n\t"\
		"ldp	q8,q9,[x3]			\n\t	ldp	q20,q21,[x7]				\n\t"\
		"fsub	v6.2d,v4.2d,v8.2d	\n\t	fsub	v18.2d,v16.2d,v20.2d	\n\t"\
		"fsub	v7.2d,v5.2d,v9.2d	\n\t	fsub	v19.2d,v17.2d,v21.2d	\n\t"\
		"fadd	v4.2d,v4.2d,v8.2d	\n\t	fadd	v16.2d,v16.2d,v20.2d	\n\t"\
		"fadd	v5.2d,v5.2d,v9.2d	\n\t	fadd	v17.2d,v17.2d,v21.2d	\n\t"\
		"ldr	x10,%[__isrt2]		\n\t	ld1r	{v29.2d},[x10]		\n\t"\
		/* combine to get 2 length-4 output subtransforms... */\
		"fsub	v8.2d,v0.2d,v4.2d	\n\t	fsub	v20.2d,v12.2d,v16.2d	\n\t"\
		"fsub	v9.2d,v1.2d,v5.2d	\n\t	fsub	v21.2d,v13.2d,v17.2d	\n\t"\
		"fadd	v0.2d,v0.2d,v4.2d	\n\t	fadd	v12.2d,v12.2d,v16.2d	\n\t"\
		"fadd	v1.2d,v1.2d,v5.2d	\n\t	fadd	v13.2d,v13.2d,v17.2d	\n\t"\
		"fadd	v4.2d,v2.2d,v7.2d	\n\t	fadd	v17.2d,v14.2d,v19.2d	\n\t"\
		"fsub	v5.2d,v3.2d,v6.2d	\n\t	fsub	v16.2d,v15.2d,v18.2d	\n\t"\
		"fsub	v2.2d,v2.2d,v7.2d	\n\t	fsub	v14.2d,v14.2d,v19.2d	\n\t"\
		"fadd	v3.2d,v3.2d,v6.2d	\n\t	fadd	v15.2d,v15.2d,v18.2d	\n\t"\
		/* v6,7,18,19 free */\
		"fsub	v6.2d,v0.2d,v12.2d	\n\t	ldr	x0,%[__o0]	\n\t	fsub	v18.2d,v14.2d,v15.2d	\n\t"\
		"fsub	v7.2d,v1.2d,v13.2d	\n\t	ldr	x1,%[__o1]	\n\t	fadd	v19.2d,v14.2d,v15.2d	\n\t"\
		"fadd	v0.2d,v0.2d,v12.2d	\n\t	ldr	x2,%[__o2]	\n\t	fmul	v18.2d,v18.2d,v29.2d	\n\t"\
		"fadd	v1.2d,v1.2d,v13.2d	\n\t	ldr	x3,%[__o3]	\n\t	fmul	v19.2d,v19.2d,v29.2d	\n\t"\
		"fsub	v12.2d,v8.2d,v21.2d	\n\t	ldr	x4,%[__o4]	\n\t	fadd	v14.2d,v16.2d,v17.2d	\n\t"\
		"fsub	v13.2d,v9.2d,v20.2d	\n\t	ldr	x5,%[__o5]	\n\t	fsub	v15.2d,v16.2d,v17.2d	\n\t"\
		"fadd	v8.2d,v8.2d,v21.2d	\n\t	ldr	x6,%[__o6]	\n\t	fmul	v14.2d,v14.2d,v29.2d	\n\t"\
		"fadd	v9.2d,v9.2d,v20.2d	\n\t	ldr	x7,%[__o7]	\n\t	fmul	v15.2d,v15.2d,v29.2d	\n\t"\
		"fsub	v16.2d,v2.2d ,v18.2d\n\t	stp	q0 ,q1 ,[x0]	\n\t"\
		"fsub	v17.2d,v3.2d ,v19.2d\n\t	stp	q6 ,q7 ,[x1]	\n\t"\
		"fadd	v2.2d ,v2.2d ,v18.2d\n\t	stp	q12,q9 ,[x2]	\n\t"\
		"fadd	v3.2d ,v3.2d ,v19.2d\n\t	stp	q8 ,q13,[x3]	\n\t"\
		"fsub	v18.2d,v4.2d ,v14.2d\n\t	stp	q2, q3 ,[x4]	\n\t"\
		"fsub	v19.2d,v5.2d ,v15.2d\n\t	stp	q16,q17,[x5]	\n\t"\
		"fadd	v4.2d ,v4.2d ,v14.2d\n\t	stp	q18,q19,[x6]	\n\t"\
		"fadd	v5.2d ,v5.2d ,v15.2d\n\t	stp	q4 ,q5 ,[x7]	\n\t"\
		:					/* outputs: none */\
		: [__r0] "m" (Xr0)	/* All inputs from memory addresses here */\
		 ,[__i1] "m" (Xi1)\
		 ,[__i2] "m" (Xi2)\
		 ,[__i3] "m" (Xi3)\
		 ,[__i4] "m" (Xi4)\
		 ,[__i5] "m" (Xi5)\
		 ,[__i6] "m" (Xi6)\
		 ,[__i7] "m" (Xi7)\
		 ,[__o0] "m" (Xo0)\
		 ,[__o1] "m" (Xo1)\
		 ,[__o2] "m" (Xo2)\
		 ,[__o3] "m" (Xo3)\
		 ,[__o4] "m" (Xo4)\
		 ,[__o5] "m" (Xo5)\
		 ,[__o6] "m" (Xo6)\
		 ,[__o7] "m" (Xo7)\
		 ,[__isrt2] "m" (Xisrt2)\
		: "cc","memory","x0","x1","x2","x3","x4","x5","x6","x7","x10",\
		"v0","v1","v2","v3","v4","v5","v6","v7","v8","v9","v12","v13","v14","v15","v16","v17","v18","v19","v20","v21","v29"	/* Clobbered registers */\
	);\
	}

	// ARM inline-asm disallows literal-byte args, so the 2 separate macro versions are unneeded
	// here — the _B variant is simply an alias for the base macro:
	#define SSE2_RADIX8_DIF_0TWIDDLE_B(Xr0, Xi1,Xi2,Xi3,Xi4,Xi5,Xi6,Xi7, Xo0,Xo1,Xo2,Xo3,Xo4,Xo5,Xo6,Xo7, Xisrt2)\
			SSE2_RADIX8_DIF_0TWIDDLE(Xr0, Xi1,Xi2,Xi3,Xi4,Xi5,Xi6,Xi7, Xo0,Xo1,Xo2,Xo3,Xo4,Xo5,Xo6,Xo7, Xisrt2)

	/* Twiddleless version of SSE2_RADIX8_DIT_TWIDDLE. Inputs enter in memory locations __i0,__i1,__i2,__i3,__i4,__i5,__i6,__i7.
	Outputs go into 16 contiguous 32-byte memory locations starting at __out and assumed disjoint with inputs.
	This macro built on the same code template as SSE2_RADIX8_DIF_TWIDDLE0, but with the I/O-location indices mutually bit reversed:
	01234567 <--> 04261537, which can be effected via the pairwise swaps 1 <--> 4 and 3 <--> 6.
	*/
	#define	SSE2_RADIX8_DIT_0TWIDDLE(Xi0,Xi1,Xi2,Xi3,Xi4,Xi5,Xi6,Xi7, Xout, Xisrt2)\
	{\
	__asm__ volatile (\
		"ldr x14,%[__out]	\n\t	ldr x10,%[__isrt2]	\n\t	ld1r {v29.2d},[x10]	\n\t"\
	/* 1st of 2 radix-4 subtransforms: *//* 2nd of 2 radix-4 subtransforms: */\
		"ldr	x0,%[__i0]			\n\t	ldr	x4,%[__i4]			\n\t"\
		"ldr	x1,%[__i1]			\n\t	ldr	x5,%[__i5]			\n\t"\
		"ldr	x2,%[__i2]			\n\t	ldr	x6,%[__i6]			\n\t"\
		"ldr	x3,%[__i3]			\n\t	ldr	x7,%[__i7]			\n\t"\
		"ldp	q0,q1,[x0]			\n\t	ldp	q12,q13,[x4]			\n\t"\
		"ldp	q8,q9,[x1]			\n\t	ldp	q20,q21,[x5]			\n\t"\
		"fsub	v2.2d,v0.2d,v8.2d	\n\t	fsub	v14.2d,v12.2d,v20.2d	\n\t"\
		"fsub	v3.2d,v1.2d,v9.2d	\n\t	fsub	v15.2d,v13.2d,v21.2d	\n\t"\
		"fadd	v0.2d,v0.2d,v8.2d	\n\t	fadd	v12.2d,v12.2d,v20.2d	\n\t"\
		"fadd	v1.2d,v1.2d,v9.2d	\n\t	fadd	v13.2d,v13.2d,v21.2d	\n\t"\
		"ldp	q4,q5,[x2]			\n\t	ldp	q16,q17,[x6]			\n\t"\
		"ldp	q8,q9,[x3]			\n\t	ldp	q20,q21,[x7]			\n\t"\
		"fsub	v6.2d,v4.2d,v8.2d	\n\t	fsub	v18.2d,v16.2d,v20.2d	\n\t"\
		"fsub	v7.2d,v5.2d,v9.2d	\n\t	fsub	v19.2d,v17.2d,v21.2d	\n\t"\
		"fadd	v4.2d,v4.2d,v8.2d	\n\t	fadd	v16.2d,v16.2d,v20.2d	\n\t"\
		"fadd	v5.2d,v5.2d,v9.2d	\n\t	fadd	v17.2d,v17.2d,v21.2d	\n\t"\
		/* combine to get the 2 length-4 transforms: */\
		"fsub	v8.2d,v0.2d,v4.2d	\n\t	fsub	v20.2d,v12.2d,v16.2d	\n\t"\
		"fsub	v9.2d,v1.2d,v5.2d	\n\t	fsub	v21.2d,v13.2d,v17.2d	\n\t"\
		"fadd	v0.2d,v0.2d,v4.2d	\n\t	fadd	v12.2d,v12.2d,v16.2d	\n\t"\
		"fadd	v1.2d,v1.2d,v5.2d	\n\t	fadd	v13.2d,v13.2d,v17.2d	\n\t"\
		"fadd	v4.2d,v2.2d,v7.2d	\n\t	fadd	v16.2d,v14.2d,v19.2d	\n\t"\
		"fsub	v5.2d,v3.2d,v6.2d	\n\t	fsub	v17.2d,v15.2d,v18.2d	\n\t"\
		"fsub	v2.2d,v2.2d,v7.2d	\n\t	fsub	v14.2d,v14.2d,v19.2d	\n\t"\
		"fadd	v3.2d,v3.2d,v6.2d	\n\t	fadd	v15.2d,v15.2d,v18.2d	\n\t"\
		/* now combine the two half-transforms: */\
		"fadd	v6.2d,v0.2d,v12.2d	\n\t	fadd	v18.2d,v1.2d,v13.2d	\n\t"\
		"fsub	v7.2d,v0.2d,v12.2d	\n\t	fsub	v19.2d,v1.2d,v13.2d	\n\t"\
		"stp	q6,q18,[x14]		\n\t	stp	q7,q19,[x14,#0x80]		\n\t"\
		"fadd	v6.2d,v16.2d,v17.2d	\n\t	fsub	v7.2d,v16.2d,v17.2d	\n\t"\
		"fmul	v16.2d,v29.2d,v6.2d	\n\t	fmul	v17.2d,v29.2d,v7.2d	\n\t"\
		"fadd	v6.2d,v4.2d,v16.2d	\n\t	fsub	v18.2d,v5.2d,v17.2d	\n\t"\
		"fsub	v7.2d,v4.2d,v16.2d	\n\t	fadd	v19.2d,v5.2d,v17.2d	\n\t"\
		"stp	q6,q18,[x14,#0xe0]	\n\t	stp	q7,q19,[x14,#0x60]		\n\t"\
		"fadd	v6.2d,v8.2d,v21.2d	\n\t	fsub	v18.2d,v9.2d,v20.2d	\n\t"\
		"fsub	v7.2d,v8.2d,v21.2d	\n\t	fadd	v19.2d,v9.2d,v20.2d	\n\t"\
		"stp	q6,q18,[x14,#0xc0]	\n\t	stp	q7,q19,[x14,#0x40]		\n\t"\
		"fsub	v6.2d,v14.2d,v15.2d	\n\t	fadd	v7.2d,v14.2d,v15.2d	\n\t"\
		"fmul	v14.2d,v29.2d,v6.2d	\n\t	fmul	v15.2d,v29.2d,v7.2d	\n\t"\
		"fsub	v6.2d,v2.2d,v14.2d	\n\t	fsub	v18.2d,v3.2d,v15.2d	\n\t"\
		"fadd	v7.2d,v2.2d,v14.2d	\n\t	fadd	v19.2d,v3.2d,v15.2d	\n\t"\
		"stp	q6,q18,[x14,#0xa0]	\n\t	stp	q7,q19,[x14,#0x20]		\n\t"\
		:					/* outputs: none */\
		: [__i0] "m" (Xi0)	/* All iputs from memory addresses here */\
		 ,[__i1] "m" (Xi1)\
		 ,[__i2] "m" (Xi2)\
		 ,[__i3] "m" (Xi3)\
		 ,[__i4] "m" (Xi4)\
		 ,[__i5] "m" (Xi5)\
		 ,[__i6] "m" (Xi6)\
		 ,[__i7] "m" (Xi7)\
		 ,[__out] "m" (Xout)\
		 ,[__isrt2] "m" (Xisrt2)\
		: "cc","memory","x0","x1","x2","x3","x4","x5","x6","x7","x10","x14",\
		"v0","v1","v2","v3","v4","v5","v6","v7","v8","v9","v12","v13","v14","v15","v16","v17","v18","v19","v20","v21","v29"	/* Clobbered registers */\
	);\
	}

	// Same as SSE2_RADIX8_DIT_0TWIDDLE but with user-specifiable [i.e. not nec. contiguous] output addresses:
	#define	SSE2_RADIX8_DIT_0TWIDDLE_OOP(Xi0,Xi1,Xi2,Xi3,Xi4,Xi5,Xi6,Xi7, Xo0,Xo1,Xo2,Xo3,Xo4,Xo5,Xo6,Xo7, Xisrt2)\
	{\
	__asm__ volatile (\
		"ldr x10,%[__isrt2]	\n\t	ld1r {v29.2d},[x10]	\n\t"\
	/* 1st of 2 radix-4 subtransforms: *//* 2nd of 2 radix-4 subtransforms: */\
		"ldr	x0,%[__i0]			\n\t	ldr	x4,%[__i4]			\n\t"\
		"ldr	x1,%[__i1]			\n\t	ldr	x5,%[__i5]			\n\t"\
		"ldr	x2,%[__i2]			\n\t	ldr	x6,%[__i6]			\n\t"\
		"ldr	x3,%[__i3]			\n\t	ldr	x7,%[__i7]			\n\t"\
		"ldp	q0,q1,[x0]			\n\t	ldp	q12,q13,[x4]			\n\t"\
		"ldp	q8,q9,[x1]			\n\t	ldp	q20,q21,[x5]			\n\t"\
		"fsub	v2.2d,v0.2d,v8.2d	\n\t	fsub	v14.2d,v12.2d,v20.2d	\n\t"\
		"fsub	v3.2d,v1.2d,v9.2d	\n\t	fsub	v15.2d,v13.2d,v21.2d	\n\t"\
		"fadd	v0.2d,v0.2d,v8.2d	\n\t	fadd	v12.2d,v12.2d,v20.2d	\n\t"\
		"fadd	v1.2d,v1.2d,v9.2d	\n\t	fadd	v13.2d,v13.2d,v21.2d	\n\t"\
		"ldp	q4,q5,[x2]			\n\t	ldp	q16,q17,[x6]			\n\t"\
		"ldp	q8,q9,[x3]			\n\t	ldp	q20,q21,[x7]			\n\t"\
		"fsub	v6.2d,v4.2d,v8.2d	\n\t	fsub	v18.2d,v16.2d,v20.2d	\n\t"\
		"fsub	v7.2d,v5.2d,v9.2d	\n\t	fsub	v19.2d,v17.2d,v21.2d	\n\t"\
		"fadd	v4.2d,v4.2d,v8.2d	\n\t	fadd	v16.2d,v16.2d,v20.2d	\n\t"\
		"fadd	v5.2d,v5.2d,v9.2d	\n\t	fadd	v17.2d,v17.2d,v21.2d	\n\t"\
		/* combine to get the 2 length-4 transforms: */\
		"fsub	v8.2d,v0.2d,v4.2d	\n\t	fsub	v20.2d,v12.2d,v16.2d	\n\t"\
		"fsub	v9.2d,v1.2d,v5.2d	\n\t	fsub	v21.2d,v13.2d,v17.2d	\n\t"\
		"fadd	v0.2d,v0.2d,v4.2d	\n\t	fadd	v12.2d,v12.2d,v16.2d	\n\t"\
		"fadd	v1.2d,v1.2d,v5.2d	\n\t	fadd	v13.2d,v13.2d,v17.2d	\n\t"\
		"fadd	v4.2d,v2.2d,v7.2d	\n\t	fadd	v16.2d,v14.2d,v19.2d	\n\t"\
		"fsub	v5.2d,v3.2d,v6.2d	\n\t	fsub	v17.2d,v15.2d,v18.2d	\n\t"\
		"fsub	v2.2d,v2.2d,v7.2d	\n\t	fsub	v14.2d,v14.2d,v19.2d	\n\t"\
		"fadd	v3.2d,v3.2d,v6.2d	\n\t	fadd	v15.2d,v15.2d,v18.2d	\n\t"\
		/* now combine the two half-transforms: */\
		"ldr	x0,%[__o0]			\n\t	ldr	x4,%[__o4]			\n\t"\
		"ldr	x1,%[__o1]			\n\t	ldr	x5,%[__o5]			\n\t"\
		"ldr	x2,%[__o2]			\n\t	ldr	x6,%[__o6]			\n\t"\
		"ldr	x3,%[__o3]			\n\t	ldr	x7,%[__o7]			\n\t"\
		"fadd	v6.2d,v0.2d,v12.2d	\n\t	fadd	v18.2d,v1.2d,v13.2d	\n\t"\
		"fsub	v7.2d,v0.2d,v12.2d	\n\t	fsub	v19.2d,v1.2d,v13.2d	\n\t"\
		"stp	q6,q18,[x0]			\n\t	stp	q7,q19,[x4]			\n\t"\
		"fadd	v6.2d,v16.2d,v17.2d	\n\t	fsub	v7.2d,v16.2d,v17.2d	\n\t"\
		"fmul	v16.2d,v29.2d,v6.2d	\n\t	fmul	v17.2d,v29.2d,v7.2d	\n\t"\
		"fadd	v6.2d,v4.2d,v16.2d	\n\t	fsub	v18.2d,v5.2d,v17.2d	\n\t"\
		"fsub	v7.2d,v4.2d,v16.2d	\n\t	fadd	v19.2d,v5.2d,v17.2d	\n\t"\
		"stp	q6,q18,[x7]			\n\t	stp	q7,q19,[x3]			\n\t"\
		"fadd	v6.2d,v8.2d,v21.2d	\n\t	fsub	v18.2d,v9.2d,v20.2d	\n\t"\
		"fsub	v7.2d,v8.2d,v21.2d	\n\t	fadd	v19.2d,v9.2d,v20.2d	\n\t"\
		"stp	q6,q18,[x6]			\n\t	stp	q7,q19,[x2]			\n\t"\
		"fsub	v6.2d,v14.2d,v15.2d	\n\t	fadd	v7.2d,v14.2d,v15.2d	\n\t"\
		"fmul	v14.2d,v29.2d,v6.2d	\n\t	fmul	v15.2d,v29.2d,v7.2d	\n\t"\
		"fsub	v6.2d,v2.2d,v14.2d	\n\t	fsub	v18.2d,v3.2d,v15.2d	\n\t"\
		"fadd	v7.2d,v2.2d,v14.2d	\n\t	fadd	v19.2d,v3.2d,v15.2d	\n\t"\
		"stp	q6,q18,[x5]			\n\t	stp	q7,q19,[x1]			\n\t"\
		:					/* outputs: none */\
		: [__i0] "m" (Xi0)	/* All iputs from memory addresses here */\
		 ,[__i1] "m" (Xi1)\
		 ,[__i2] "m" (Xi2)\
		 ,[__i3] "m" (Xi3)\
		 ,[__i4] "m" (Xi4)\
		 ,[__i5] "m" (Xi5)\
		 ,[__i6] "m" (Xi6)\
		 ,[__i7] "m" (Xi7)\
		 ,[__o0] "m" (Xo0)\
		 ,[__o1] "m" (Xo1)\
		 ,[__o2] "m" (Xo2)\
		 ,[__o3] "m" (Xo3)\
		 ,[__o4] "m" (Xo4)\
		 ,[__o5] "m" (Xo5)\
		 ,[__o6] "m" (Xo6)\
		 ,[__o7] "m" (Xo7)\
		 ,[__isrt2] "m" (Xisrt2)\
		: "cc","memory","x0","x1","x2","x3","x4","x5","x6","x7","x10",\
		"v0","v1","v2","v3","v4","v5","v6","v7","v8","v9","v12","v13","v14","v15","v16","v17","v18","v19","v20","v21","v29"	/* Clobbered registers */\
	);\
	}

	// SSE2 analog of dft_macro.h::RADIX_08_DIF_TWIDDLE_OOP - Result of adding separate I/O addressing to
	// radix8_dif_dit_pass_gcc64.h::SSE2_RADIX8_DIF_TWIDDLE:
	/* Dec 2020: Need to cut #args for Apple M1/Clang builds on Arm64 - do similar on x86 to avoid
	multiple versions of the macro having different arglists.
	Inputs i0-7 are always ptrs to BRed addresses add[0,4,2,6,1,5,3,7], thus we need the following bfly-address-pairs:
		lcol:			rcol:
		i0,4 = add0,1	i1,5 = add4,5
		i2,6 = add2,3	i3,7 = add6,7
	i.e. within each column we have linear address-access: lcol = in0+[0,i1,i2,i3], rcol = [in0+i4]+[0,i1,i2,i3]
	Thus, in the #arg-reduced version of the macro, input-addresses are computed from in0 and [i1,i2,i4],
	where in0 is a memory base-address and the i's are LITERAL [BYTE] OFFSETS.

	Output addresses are nonlinear first due to our main-array index-padding scheme used to avoid cache conflicts,
	and secondly because e.g. radix-192,320 use permuted O-address octets, so in place of the previous
	passing of the 8 O-addresses as separate args we now send base-address for each O-address octet and pointer
	to a local length-8 array containing the double* index-offsets, and do the output-address
	pointer arithmetic inside the macro:

	ASSUMES: (vec_dbl)ISRT2 has been loaded into SIMD register v29 via preceding call to SSE2_RADIX8_DIF_0TWIDDLE.
	*/
	/* SSE2_RADIX8_DIF_TWIDDLE_OOP: radix-8 DIF pass with twiddles, out-of-place, ARMv8 NEON.
	Args: Xin0 = input base address; Xi1 = unit input byte-offset (i2 = 2*i1, i4 = 4*i1 formed in-asm via lsl);
	      Xout0 = output base address; Xoff = pointer to uint32[8] of doubleword output index-offsets;
	      Xtwid_ptrs = pointer to array of twiddle-component pointers, ordered [c1,s1],[c2,s2],...,[c7,s7]
	      (16 bytes per [c,s] pair, hence the #0x10-granular ldp offsets below);
	      Xtwo = unused in this ARM implementation (x86/FMA builds only).
	Scratch: GPRs x0-x8,x10-x13 and SIMD v0-v23; v29 must hold (vec_dbl)ISRT2 on entry (see note above).
	Code is arranged as two side-by-side columns (lcol/rcol) of independently schedulable instructions. */
	#define SSE2_RADIX8_DIF_TWIDDLE_OOP(Xin0,Xi1, Xout0,Xoff, Xtwid_ptrs, Xtwo)\
	{\
	__asm__ volatile (\
	/* Compute input addresses: x0 = base-address in0; i1 is base byte-offset, no need to lshift it prior to add: */\
		"ldr	x0,%[in0]			\n\t	ldr	x8,%[twid_ptrs]		\n\t"\
		"ldr	w7,%[i1]	\n\t"/* i1-offset goes in x7, which is overwritten via sum-with-base-address last */\
		"add	x4,x0,x7,lsl #2		\n\t"/* x4 = in0+i4 */\
		"add	x1,x0,x7			\n\t	add	x5,x4,x7			\n\t"/* x[1,5] = in0 + i1,5 */\
		"add	x2,x0,x7,lsl #1		\n\t	add	x6,x4,x7,lsl #1		\n\t"/* x[2,6] = in0 + i2,6 */\
		"add	x3,x2,x7			\n\t	add	x7,x6,x7			\n\t"/* x[3,7] = in0 + i3,7 */\
		"ldp	x10,x11,[x8]		\n\t	ldp	x12,x13,[x8,#0x30]	\n\t"/* [c1,s1],[c4,s4]-pointers */\
		"									ld1r	{v16.2d},[x12]	\n\t"\
		"									ld1r	{v17.2d},[x13]	\n\t"\
		"									ldp	q12,q13,[x4]				\n\t"\
		"									ldp	q14,q15,[x5]				\n\t"\
		"ld1r	{v8.2d},[x10]		\n\t	fmul	v18.2d,v12.2d,v16.2d	\n\t"\
		"ld1r	{v9.2d},[x11]		\n\t	fmul	v19.2d,v13.2d,v16.2d	\n\t"\
		"ldp	q0,q1,[x0]			\n\t	fmls	v18.2d,v13.2d,v17.2d	\n\t"\
		"ldp	q2,q3,[x1]			\n\t	fmla	v19.2d,v12.2d,v17.2d	\n\t"\
		"fmul	v4.2d,v2.2d,v8.2d	\n\t	ldp	x12,x13,[x8,#0x40]	\n\t"/* [c5,s5]-pointers */\
		"fmul	v5.2d,v3.2d,v8.2d	\n\t"\
		"fmls	v4.2d,v3.2d,v9.2d	\n\t	ld1r	{v16.2d},[x12]	\n\t"\
		"fmla	v5.2d,v2.2d,v9.2d	\n\t	ld1r	{v17.2d},[x13]	\n\t"\
		"fsub	v2.2d,v0.2d,v4.2d	\n\t	fmul	v12.2d,v14.2d,v16.2d	\n\t"\
		"fsub	v3.2d,v1.2d,v5.2d	\n\t	fmul	v13.2d,v15.2d,v16.2d	\n\t"\
		"fadd	v0.2d,v0.2d,v4.2d	\n\t	fmls	v12.2d,v15.2d,v17.2d	\n\t"\
		"fadd	v1.2d,v1.2d,v5.2d	\n\t	fmla	v13.2d,v14.2d,v17.2d	\n\t"\
		"ldp	q4,q5,[x2]			\n\t	fsub	v14.2d,v18.2d,v12.2d	\n\t"\
		"ldp	q6,q7,[x3]			\n\t	fsub	v15.2d,v19.2d,v13.2d	\n\t"\
		"ldp	x10,x11,[x8,#0x10]	\n\t	fadd	v12.2d,v18.2d,v12.2d	\n\t"/* lcol: [c2,s2]-pointers */\
		"									fadd	v13.2d,v19.2d,v13.2d	\n\t"\
		"ld1r	{v8.2d},[x10]		\n\t	ldp	q16,q17,[x6]				\n\t"\
		"ld1r	{v9.2d},[x11]		\n\t	ldp	q18,q19,[x7]				\n\t"\
		"fmul	v10.2d,v4.2d,v8.2d	\n\t	ldp	x12,x13,[x8,#0x50]	\n\t"/* [c6,s6]-pointers */\
		"fmul	v11.2d,v5.2d,v8.2d	\n\t"\
		"fmls	v10.2d,v5.2d,v9.2d	\n\t	ld1r	{v20.2d},[x12]	\n\t"\
		"fmla	v11.2d,v4.2d,v9.2d	\n\t	ld1r	{v21.2d},[x13]	\n\t"\
		"ldp	x10,x11,[x8,#0x20]	\n\t	fmul	v22.2d,v16.2d,v20.2d	\n\t"/* lcol: [c3,s3]-pointers */\
		"									fmul	v23.2d,v17.2d,v20.2d	\n\t"\
		"ld1r	{v8.2d},[x10]		\n\t	fmls	v22.2d,v17.2d,v21.2d	\n\t"\
		"ld1r	{v9.2d},[x11]		\n\t	fmla	v23.2d,v16.2d,v21.2d	\n\t"\
		"fmul	v4.2d,v6.2d,v8.2d	\n\t	ldp	x12,x13,[x8,#0x60]	\n\t"/* [c7,s7]-pointers */\
		"fmul	v5.2d,v7.2d,v8.2d	\n\t"\
		"fmls	v4.2d,v7.2d,v9.2d	\n\t	ld1r	{v20.2d},[x12]	\n\t"\
		"fmla	v5.2d,v6.2d,v9.2d	\n\t	ld1r	{v21.2d},[x13]	\n\t"\
		"fsub	v6.2d,v10.2d,v4.2d	\n\t	fmul	v16.2d,v18.2d,v20.2d	\n\t"\
		"fsub	v7.2d,v11.2d,v5.2d	\n\t	fmul	v17.2d,v19.2d,v20.2d	\n\t"\
		"fadd	v4.2d,v10.2d,v4.2d	\n\t	fmls	v16.2d,v19.2d,v21.2d	\n\t"\
		"fadd	v5.2d,v11.2d,v5.2d	\n\t	fmla	v17.2d,v18.2d,v21.2d	\n\t"\
										"	fsub	v18.2d,v22.2d,v16.2d	\n\t"\
										"	fsub	v19.2d,v23.2d,v17.2d	\n\t"\
										"	fadd	v16.2d,v22.2d,v16.2d	\n\t"\
										"	fadd	v17.2d,v23.2d,v17.2d	\n\t"\
		/* combine to get 2 length-4 output subtransforms... */\
		"fsub	v8.2d,v0.2d,v4.2d	\n\t	fsub	v20.2d,v12.2d,v16.2d	\n\t"\
		"fsub	v9.2d,v1.2d,v5.2d	\n\t	fsub	v21.2d,v13.2d,v17.2d	\n\t"\
		"fadd	v0.2d,v0.2d,v4.2d	\n\t	fadd	v12.2d,v12.2d,v16.2d	\n\t"\
		"fadd	v1.2d,v1.2d,v5.2d	\n\t	fadd	v13.2d,v13.2d,v17.2d	\n\t"\
		"fadd	v4.2d,v2.2d,v7.2d	\n\t	fadd	v17.2d,v14.2d,v19.2d	\n\t"\
		"fsub	v5.2d,v3.2d,v6.2d	\n\t	fsub	v16.2d,v15.2d,v18.2d	\n\t"\
		"fsub	v2.2d,v2.2d,v7.2d	\n\t	fsub	v14.2d,v14.2d,v19.2d	\n\t"\
		"fadd	v3.2d,v3.2d,v6.2d	\n\t	fadd	v15.2d,v15.2d,v18.2d	\n\t"\
	/* Compute output addresses: */\
		/* Load output base-address into x10 and offset-array pointer into x11: */\
		"ldr	x10,%[out0]			\n\t	ldr	x11,%[off]			\n\t"\
	/* v6,7,18,19 free - interleave O-address computations with floating-double vector ones */\
		"ldp	w0,w1,[x11]				\n\t	ldp	w4,w5,[x11,#0x10]	\n\t"/* off[0|1,4|5] */\
		"fsub	v6.2d,v0.2d,v12.2d		\n\t	fsub	v18.2d,v14.2d,v15.2d	\n\t"\
		"fsub	v7.2d,v1.2d,v13.2d		\n\t	fadd	v19.2d,v14.2d,v15.2d	\n\t"\
		"ldp	w2,w3,[x11,#0x08]		\n\t	ldp	w6,w7,[x11,#0x18]	\n\t"/* off[2|3,6|7] */\
		"fadd	v0.2d,v0.2d,v12.2d		\n\t	fmul	v18.2d,v18.2d,v29.2d	\n\t"\
		"fadd	v1.2d,v1.2d,v13.2d		\n\t	fmul	v19.2d,v19.2d,v29.2d	\n\t"\
		"add	x0,x10,x0,lsl #3		\n\t	add	x4,x10,x4,lsl #3\n\t"/* x[0,4] = (double *)out0 + off[0,4] */\
		"fsub	v12.2d,v8.2d,v21.2d		\n\t	fadd	v14.2d,v16.2d,v17.2d	\n\t"\
		"add	x1,x10,x1,lsl #3		\n\t	add	x5,x10,x5,lsl #3\n\t"/* x[1,5] = (double *)out0 + off[1,5] */\
		"fsub	v13.2d,v9.2d,v20.2d		\n\t	fsub	v15.2d,v16.2d,v17.2d	\n\t"\
		"add	x2,x10,x2,lsl #3		\n\t	add	x6,x10,x6,lsl #3\n\t"/* x[2,6] = (double *)out0 + off[2,6] */\
		"fadd	v8.2d,v8.2d,v21.2d		\n\t	fmul	v14.2d,v14.2d,v29.2d	\n\t"\
		"add	x3,x10,x3,lsl #3		\n\t	add	x7,x10,x7,lsl #3\n\t"/* x[3,7] = (double *)out0 + off[3,7] */\
		"fadd	v9.2d,v9.2d,v20.2d		\n\t	fmul	v15.2d,v15.2d,v29.2d	\n\t"\
		"fsub	v16.2d,v2.2d ,v18.2d	\n\t	stp	q0 ,q1 ,[x0]	\n\t"\
		"fsub	v17.2d,v3.2d ,v19.2d	\n\t	stp	q6 ,q7 ,[x1]	\n\t"\
		"fadd	v2.2d ,v2.2d ,v18.2d	\n\t	stp	q12,q9 ,[x2]	\n\t"\
		"fadd	v3.2d ,v3.2d ,v19.2d	\n\t	stp	q8 ,q13,[x3]	\n\t"\
		"fsub	v18.2d,v4.2d ,v14.2d	\n\t	stp	q2, q3 ,[x4]	\n\t"\
		"fsub	v19.2d,v5.2d ,v15.2d	\n\t	stp	q16,q17,[x5]	\n\t"\
		"fadd	v4.2d ,v4.2d ,v14.2d	\n\t	stp	q18,q19,[x6]	\n\t"\
		"fadd	v5.2d ,v5.2d ,v15.2d	\n\t	stp	q4 ,q5 ,[x7]	\n\t"\
		:					/* outputs: none */\
		: [in0] "m" (Xin0)	/* All 'm'-inputs from memory addresses here... */\
		 ,[i1] "m" (Xi1)\
		 ,[out0] "m" (Xout0) /* output-address-octet base pointer */\
		 ,[off] "m" (Xoff)	/* and pointer to uint32 array of 8 double* index offsets */\
	/* Re-use first-coded DIT version of this macro, just swap FMA/FMS in CMULs, BR input order,replace DIT's v17+-v16 in final set of
	bflys with v16+-v17, and un-fiddle the Im-parts-swap of o6/o7, which also necessitated the STP instructions for those 2 outputs to
	be swapped due to register-dependencies.
		Comment: Dunno why SSE2 version of this macro had target argnames [c,s{1-7}] BRed, since not BRing them allows DIT
	sincos index-ordering to be used, but in any event undo that arglist-BRing in this ARM implementation of the macro code's: */\
		 ,[twid_ptrs] "m" (Xtwid_ptrs)\
		 ,[two] "m" (Xtwo)/* Only used in x86/FMA implementations of this macro */\
		: "cc","memory","x0","x1","x2","x3","x4","x5","x6","x7","x8","x10","x11","x12","x13",\
		"v0","v1","v2","v3","v4","v5","v6","v7","v8","v9","v10","v11","v12","v13","v14","v15","v16","v17","v18","v19","v20","v21","v22","v23","v29"	/* Clobbered registers */\
	);\
	}

	// SSE2 analog of dft_macro.h::RADIX_08_DIT_TWIDDLE_OOP - Result of sign-flippage and adding separate I/O addressing to
	// radix8_dif_dit_pass_gcc64.h::SSE2_RADIX8_DIF_TWIDDLE. We begin with the DIF macro here because we need a pre-twiddles
	// implementation for our purposes, whereas SSE2_RADIX8_DIT_TWIDDLE is post-twiddles.
	// ARMv8 SIMD opcount: [8 ldp, 8 stp, 30 ldr, 14 ld1r], 32 fadd/fsub, 52 fmul/fma
	// SSE2 SIMD Opcount: 102 load/store [30 implicit], 66 add/sub, 50 mul. Compare to DFT macros used for radix-8-pass-with-twiddles:
	// DIF opcount : 140 load/store [56 implicit], 66 add/sub, 32 mul
	// DIT opcount :  85 load/store [36 implicit], 68 add/sub, 32 mul .
	/* Dec 2020: Needed to cut #args in defs of SSE2_RADIX8_DIF_TWIDDLE_OOP and SSE2_RADIX8_DIT_TWIDDLE_OOP
	from 30 to < 24 for Apple M1/Clang builds on Arm64 - do similar on x86 to avoid multiple versions of the
	macro having different arglists.
	The DIT version uses fixed-multiple pointer-offsets for both in-and-output address octets,
	because it is only ever used with small local data arrays as IOs, not the main residue array with its wide
	strides and index-padding scheme. So #def the needed basic strides in the 'calling' routine, then:
	I-addresses are in-order, i.e. what were formerly pointers [i0-7] are now computed as
		in0  + {0,i[1-7]} =  in0 + {0,i[1,2,1+2, 4,4+1,4+2,4+1+2]}
	O-addresses are BRed, i.e. what were formerly pointers [o0-7] are now computed as
		out0 + {0,o_off[4,2,6,1,5,3,7]} .

	ASSUMES: (vec_dbl)ISRT2 has been loaded into SIMD register v29 via preceding call to SSE2_RADIX8_DIT_0TWIDDLE_OOP.
	*/
	/* SSE2_RADIX8_DIT_TWIDDLE_OOP: radix-8 DIT pass with twiddles, out-of-place, ARMv8 NEON.
	Sign-flipped (conjugate-twiddle) analog of SSE2_RADIX8_DIF_TWIDDLE_OOP above: the CMUL fmla/fmls
	pairs are swapped relative to the DIF version, and the outputs use a bit-reversed store pattern.
	Args: Xin0 = input base address; Xi1 = unit input byte-offset (i2,i4 derived in-asm via lsl);
	      Xout0 = output base address; Xo_off = unit bytewise output stride (multiples 2..7 derived in-asm);
	      Xtwid_ptrs = pointer to twiddle-component pointers ordered [c1,s1],...,[c7,s7];
	      Xtwo = unused here (FMA builds only).
	Scratch: GPRs x0-x8,x10-x13 and SIMD v0-v23; v29 must hold (vec_dbl)ISRT2 on entry (see note above). */
	#define SSE2_RADIX8_DIT_TWIDDLE_OOP(Xin0,Xi1, Xout0,Xo_off, Xtwid_ptrs, Xtwo)\
	{\
	__asm__ volatile (\
	/* Compute input addresses: x0 = base-address in0; i1 is base byte-offset, no need to lshift it prior to add: */\
		"ldr	x0,%[in0]			\n\t	ldr	x8,%[twid_ptrs]		\n\t"\
		"ldr	w7,%[i1]	\n\t"/* i1-offset goes in x7, which is overwritten via sum-with-base-address last */\
		"add	x4,x0,x7,lsl #2		\n\t"/* x4 = in0+i4 */\
		"add	x1,x0,x7			\n\t	add	x5,x4,x7			\n\t"/* x[1,5] = in0 + i1,5 */\
		"add	x2,x0,x7,lsl #1		\n\t	add	x6,x4,x7,lsl #1		\n\t"/* x[2,6] = in0 + i2,6 */\
		"add	x3,x2,x7			\n\t	add	x7,x6,x7			\n\t"/* x[3,7] = in0 + i3,7 */\
		"ldp	x10,x11,[x8]		\n\t	ldp	x12,x13,[x8,#0x30]	\n\t"/* [c1,s1],[c4,s4]-pointers */\
		"									ld1r	{v16.2d},[x12]	\n\t"\
		"									ld1r	{v17.2d},[x13]	\n\t"\
		"									ldp	q12,q13,[x4]				\n\t"\
		"									ldp	q14,q15,[x5]				\n\t"\
		"ld1r	{v8.2d},[x10]		\n\t	fmul	v18.2d,v12.2d,v16.2d	\n\t"\
		"ld1r	{v9.2d},[x11]		\n\t	fmul	v19.2d,v13.2d,v16.2d	\n\t"\
		"ldp	q0,q1,[x0]			\n\t	fmla	v18.2d,v13.2d,v17.2d	\n\t"\
		"ldp	q2,q3,[x1]			\n\t	fmls	v19.2d,v12.2d,v17.2d	\n\t"\
		"fmul	v4.2d,v2.2d,v8.2d	\n\t	ldp	x12,x13,[x8,#0x40]	\n\t"/* [c5,s5]-pointers */\
		"fmul	v5.2d,v3.2d,v8.2d	\n\t"\
		"fmla	v4.2d,v3.2d,v9.2d	\n\t	ld1r	{v16.2d},[x12]	\n\t"\
		"fmls	v5.2d,v2.2d,v9.2d	\n\t	ld1r	{v17.2d},[x13]	\n\t"\
		"fsub	v2.2d,v0.2d,v4.2d	\n\t	fmul	v12.2d,v14.2d,v16.2d	\n\t"\
		"fsub	v3.2d,v1.2d,v5.2d	\n\t	fmul	v13.2d,v15.2d,v16.2d	\n\t"\
		"fadd	v0.2d,v0.2d,v4.2d	\n\t	fmla	v12.2d,v15.2d,v17.2d	\n\t"\
		"fadd	v1.2d,v1.2d,v5.2d	\n\t	fmls	v13.2d,v14.2d,v17.2d	\n\t"\
		"ldp	q4,q5,[x2]			\n\t	fsub	v14.2d,v18.2d,v12.2d	\n\t"\
		"ldp	q6,q7,[x3]			\n\t	fsub	v15.2d,v19.2d,v13.2d	\n\t"\
		"ldp	x10,x11,[x8,#0x10]	\n\t	fadd	v12.2d,v18.2d,v12.2d	\n\t"/* lcol: [c2,s2]-pointers */\
		"									fadd	v13.2d,v19.2d,v13.2d	\n\t"\
		"ld1r	{v8.2d},[x10]		\n\t	ldp	q16,q17,[x6]				\n\t"\
		"ld1r	{v9.2d},[x11]		\n\t	ldp	q18,q19,[x7]				\n\t"\
		"fmul	v10.2d,v4.2d,v8.2d	\n\t	ldp	x12,x13,[x8,#0x50]	\n\t"/* [c6,s6]-pointers */\
		"fmul	v11.2d,v5.2d,v8.2d	\n\t"\
		"fmla	v10.2d,v5.2d,v9.2d	\n\t	ld1r	{v20.2d},[x12]	\n\t"\
		"fmls	v11.2d,v4.2d,v9.2d	\n\t	ld1r	{v21.2d},[x13]	\n\t"\
		"ldp	x10,x11,[x8,#0x20]	\n\t	fmul	v22.2d,v16.2d,v20.2d	\n\t"/* lcol: [c3,s3]-pointers */\
		"									fmul	v23.2d,v17.2d,v20.2d	\n\t"\
		"ld1r	{v8.2d},[x10]		\n\t	fmla	v22.2d,v17.2d,v21.2d	\n\t"\
		"ld1r	{v9.2d},[x11]		\n\t	fmls	v23.2d,v16.2d,v21.2d	\n\t"\
		"fmul	v4.2d,v6.2d,v8.2d	\n\t	ldp	x12,x13,[x8,#0x60]	\n\t"/* [c7,s7]-pointers */\
		"fmul	v5.2d,v7.2d,v8.2d	\n\t"\
		"fmla	v4.2d,v7.2d,v9.2d	\n\t	ld1r	{v20.2d},[x12]	\n\t"\
		"fmls	v5.2d,v6.2d,v9.2d	\n\t	ld1r	{v21.2d},[x13]	\n\t"\
		"fsub	v6.2d,v10.2d,v4.2d	\n\t	fmul	v16.2d,v18.2d,v20.2d	\n\t"\
		"fsub	v7.2d,v11.2d,v5.2d	\n\t	fmul	v17.2d,v19.2d,v20.2d	\n\t"\
		"fadd	v4.2d,v10.2d,v4.2d	\n\t	fmla	v16.2d,v19.2d,v21.2d	\n\t"\
		"fadd	v5.2d,v11.2d,v5.2d	\n\t	fmls	v17.2d,v18.2d,v21.2d	\n\t"\
										"	fsub	v18.2d,v22.2d,v16.2d	\n\t"\
										"	fsub	v19.2d,v23.2d,v17.2d	\n\t"\
										"	fadd	v16.2d,v22.2d,v16.2d	\n\t"\
										"	fadd	v17.2d,v23.2d,v17.2d	\n\t"\
		/* combine to get 2 length-4 output subtransforms... */\
		"fsub	v8.2d,v0.2d,v4.2d	\n\t	fsub	v20.2d,v12.2d,v16.2d	\n\t"\
		"fsub	v9.2d,v1.2d,v5.2d	\n\t	fsub	v21.2d,v13.2d,v17.2d	\n\t"\
		"fadd	v0.2d,v0.2d,v4.2d	\n\t	fadd	v12.2d,v12.2d,v16.2d	\n\t"\
		"fadd	v1.2d,v1.2d,v5.2d	\n\t	fadd	v13.2d,v13.2d,v17.2d	\n\t"\
		"fadd	v4.2d,v2.2d,v7.2d	\n\t	fadd	v17.2d,v14.2d,v19.2d	\n\t"\
		"fsub	v5.2d,v3.2d,v6.2d	\n\t	fsub	v16.2d,v15.2d,v18.2d	\n\t"\
		"fsub	v2.2d,v2.2d,v7.2d	\n\t	fsub	v14.2d,v14.2d,v19.2d	\n\t"\
		"fadd	v3.2d,v3.2d,v6.2d	\n\t	fadd	v15.2d,v15.2d,v18.2d	\n\t"\
	/* v6,7,18,19 free */\
	/* Dec 2020: O-addresses are BRed, i.e. what were formerly pointers [o0-7] = out0 + {0,off[4,2,6,1,5,3,7]}
	are now computed as out0 + {0,off[4,2,4+2, 1,4+1,1+2,4+1+2]} */\
		"ldr	x0,%[out0]			\n\t	ldr	x4,%[o_off]			\n\t"/* out0 is O-base-address; o_off is unit bytewise O-offset, still need multiples 2-7 */\
		"add	x6,x4,x4			\n\t	add	x7,x6,x6			\n\t"/* O-offset multiples 2,4 go in x6,7, which are overwritten via sum-with-base-address last */\
		"add	x4,x0,x4			\n\t"/* x4 = out0+i1; loading separate i1,2,4-offsets allows us to reduce instruction serialization, which can't hurt */\
		"add	x1,x0,x7			\n\t	add	x5,x4,x7			\n\t"/* x[1,5] = out0 + i4,5 */\
		"add	x2,x0,x6			\n\t	add	x6,x4,x6			\n\t"/* x[2,6] = out0 + i2,3 */\
		"add	x3,x2,x7			\n\t	add	x7,x6,x7			\n\t"/* x[3,7] = out0 + i6,7 */\
		"fsub	v6.2d,v0.2d,v12.2d		\n\t	fsub	v18.2d,v14.2d,v15.2d	\n\t"\
		"fsub	v7.2d,v1.2d,v13.2d		\n\t	fadd	v19.2d,v14.2d,v15.2d	\n\t"\
		"fadd	v0.2d,v0.2d,v12.2d		\n\t	fmul	v18.2d,v18.2d,v29.2d	\n\t"\
		"fadd	v1.2d,v1.2d,v13.2d		\n\t	fmul	v19.2d,v19.2d,v29.2d	\n\t"\
		"fsub	v12.2d,v8.2d,v21.2d		\n\t	fadd	v14.2d,v17.2d,v16.2d	\n\t"\
		"fsub	v13.2d,v9.2d,v20.2d		\n\t	fsub	v15.2d,v17.2d,v16.2d	\n\t"\
		"fadd	v8.2d,v8.2d,v21.2d		\n\t	fmul	v14.2d,v14.2d,v29.2d	\n\t"\
		"fadd	v9.2d,v9.2d,v20.2d		\n\t	fmul	v15.2d,v15.2d,v29.2d	\n\t"\
		"fsub	v16.2d,v2.2d ,v18.2d	\n\t	stp	q0 ,q1 ,[x0]	\n\t"\
		"fsub	v17.2d,v3.2d ,v19.2d	\n\t	stp	q6 ,q7 ,[x1]	\n\t"\
		"fadd	v2.2d ,v2.2d ,v18.2d	\n\t	stp	q12,q9 ,[x3]	\n\t"\
		"fadd	v3.2d ,v3.2d ,v19.2d	\n\t	stp	q8 ,q13,[x2]	\n\t"\
		"fsub	v18.2d,v4.2d ,v14.2d	\n\t	stp	q2, q3 ,[x7]	\n\t"\
		"fsub	v19.2d,v5.2d ,v15.2d	\n\t	stp	q16,q17,[x6]	\n\t"\
		"fadd	v4.2d ,v4.2d ,v14.2d	\n\t	stp	q4 ,q19,[x4]	\n\t"\
		"fadd	v5.2d ,v5.2d ,v15.2d	\n\t	stp	q18,q5 ,[x5]	\n\t"\
		:					/* outputs: none */\
		: [in0] "m" (Xin0)	/* All 'm'-inputs from memory addresses here... */\
		 ,[i1] "m" (Xi1)\
		 ,[out0] "m" (Xout0)\
		 ,[o_off] "m" (Xo_off)/* O-address pointer-stride */\
		 ,[twid_ptrs] "m" (Xtwid_ptrs)\
		 ,[two] "m" (Xtwo)/* Only used in FMA implementations of this macro */\
		: "cc","memory","x0","x1","x2","x3","x4","x5","x6","x7","x8","x10","x11","x12","x13",\
		"v0","v1","v2","v3","v4","v5","v6","v7","v8","v9","v10","v11","v12","v13","v14","v15","v16","v17","v18","v19","v20","v21","v22","v23","v29"	/* Clobbered registers */\
	);\
	}

	// Based on the SSE2_RADIX16_DIT_NOTWIDDLE macro in radix16_ditN_cy_dif1_gcc64.h, but with completely
	// specifiable 16-input addressing required for usage as the power-of-2 component of a twiddleless
	// radix = [odd*2^n] DFT routine.
	// We use just a single output base-pointer plus literal ostrides which are [1,2,3,4]-multiples of
	// __01; this allows us to cut GP-register usage, which is absolutely a must for the 32-bit version
	// of the macro, and is a benefit to the 64-bit versions which code-fold to yield 2 side-by-side
	// streams of independently executable instructions, one for data in xmm0-7, the other using xmm8-15.
	/* SSE2_RADIX16_DIT_0TWIDDLE: radix-16 DIT pass, no twiddles, ARMv8 NEON.
	Args: Xin0 = input base address; Xoff = pointer to uint32[16] of doubleword input index-offsets;
	      Xisrt2 = address of 1/sqrt2 vector constant, with the [cc0,ss0] pair assumed to live at
	      Xisrt2+0x10 (loaded into v29 and v30,v31 respectively); Xtwo = unused here (x86/FMA only);
	      Xout0 = output base address; Xo1-Xo4 = literal output byte-strides, [1,2,3,4]-multiples of o1
	      (see header comment above).
	Scratch: GPRs x0-x17 and SIMD v0-v23,v28-v31; x18/x29 deliberately untouched (reserved on Apple).
	Structure: two passes of four radix-4 DIT subtransforms, pass 1 reading the (possibly permuted)
	inputs and writing local store, pass 2 reading that store back and writing final outputs in place. */
	#define SSE2_RADIX16_DIT_0TWIDDLE(Xin0,Xoff, Xisrt2,Xtwo, Xout0,Xo1,Xo2,Xo3,Xo4)\
	{\
	__asm__ volatile (\
		"ldr	x16,%[in0]	\n\t	ldr	x17,%[off]	\n\t"/* Load input base-address into x16 and int32[16] offset-array pointer into x17 */\
		/* isrt2,cc0,ss0 not needed until pass 2, but use GPR x7 to load three vector-regs now: */\
		"ldr	x7,%[__isrt2]		\n\t	ld1r	{v29.2d},[x7]	\n\t"\
		"ldp	q30,q31,[x7,#0x10]	\n\t"/* cc0,ss0 */\
		"ldp	w8 ,w9 ,[x17]		\n\t"/* Use x8 -27 for I-addresses, x0-15 for O-addresses */\
		"ldp	w10,w11,[x17,#0x08]	\n\t"/* i[0-3] */\
		"add	x8 ,x16,x8 ,lsl #3	\n\t	add	x9 ,x16,x9 ,lsl #3	\n\t"\
		"add	x10,x16,x10,lsl #3	\n\t	add	x11,x16,x11,lsl #3	\n\t"\
		/* SSE2_RADIX4_DIT_0TWIDDLE_B(r1 ):	SSE2_RADIX4_DIT_0TWIDDLE_B(r9 ): */\
		"ldp	q8 ,q9 ,[x8 ]		\n\t	ldp	w12,w13,[x17,#0x10]	\n\t"/* i[4-7] */\
		"ldp	q0 ,q1 ,[x9 ]		\n\t	ldp	w14,w15,[x17,#0x18]	\n\t"\
		"ldp	q10,q11,[x10]		\n\t	add	x12,x16,x12,lsl #3	\n\t	add	x13,x16,x13,lsl #3	\n\t"\
		"ldp	q4 ,q5 ,[x11]		\n\t	add	x14,x16,x14,lsl #3	\n\t	add	x15,x16,x15,lsl #3	\n\t"\
		"fsub	v2.2d ,v8.2d ,v0.2d	\n\t	ldp	q20,q21,[x12]		\n\t"\
		"fsub	v3.2d ,v9.2d ,v1.2d	\n\t	ldp	q12,q13,[x13]		\n\t"\
		"fadd	v0.2d ,v8.2d ,v0.2d	\n\t	ldp	q22,q23,[x14]		\n\t"\
		"fadd	v1.2d ,v9.2d ,v1.2d	\n\t	ldp	q16,q17,[x15]		\n\t"\
		"fsub	v6.2d ,v10.2d,v4.2d	\n\t	fsub	v14.2d,v20.2d,v12.2d	\n\t"\
		"fsub	v7.2d ,v11.2d,v5.2d	\n\t	fsub	v15.2d,v21.2d,v13.2d	\n\t"\
		"fadd	v4.2d ,v10.2d,v4.2d	\n\t	fadd	v12.2d,v20.2d,v12.2d	\n\t"\
		"fadd	v5.2d ,v11.2d,v5.2d	\n\t	fadd	v13.2d,v21.2d,v13.2d	\n\t"\
		"fsub	v8.2d ,v0.2d,v4.2d	\n\t	fsub	v18.2d,v22.2d,v16.2d	\n\t"\
		"fsub	v9.2d ,v1.2d,v5.2d	\n\t	fsub	v19.2d,v23.2d,v17.2d	\n\t"\
		"fadd	v4.2d ,v0.2d,v4.2d	\n\t	fadd	v16.2d,v22.2d,v16.2d	\n\t"\
		"fadd	v5.2d ,v1.2d,v5.2d	\n\t	fadd	v17.2d,v23.2d,v17.2d	\n\t"\
		"fsub	v10.2d,v2.2d,v7.2d	\n\t	fsub	v20.2d,v12.2d,v16.2d	\n\t"\
		"fsub	v11.2d,v3.2d,v6.2d	\n\t	fsub	v21.2d,v13.2d,v17.2d	\n\t"\
		"fadd	v7.2d ,v2.2d,v7.2d	\n\t	fadd	v16.2d,v12.2d,v16.2d	\n\t"\
		"fadd	v6.2d ,v3.2d,v6.2d	\n\t	fadd	v17.2d,v13.2d,v17.2d	\n\t"\
		"ldr	x0,%[__out0]		\n\t	ldr	w7,%[__o4]			\n\t"\
		"ldr	w1,%[__o1]			\n\t	add	x1,x0,x1			\n\t"\
		"ldr	w2,%[__o2]			\n\t	add	x2,x0,x2			\n\t"\
		"ldr	w3,%[__o3]			\n\t	add	x3,x0,x3			\n\t"\
/* o0 */"stp	q4 ,q5 ,[x0]	\n\t	add	x4,x0,x7	\n\t	fsub	v22.2d,v14.2d,v19.2d	\n\t"\
/* o1 */"stp	q7 ,q11,[x1]	\n\t	add	x5,x1,x7	\n\t	fsub	v23.2d,v15.2d,v18.2d	\n\t"\
/* o2 */"stp	q8 ,q9 ,[x2]	\n\t	add	x6,x2,x7	\n\t	fadd	v19.2d,v14.2d,v19.2d	\n\t"\
/* o3 */"stp	q10,q6 ,[x3]	\n\t	add	x7,x3,x7	\n\t	fadd	v18.2d,v15.2d,v18.2d	\n\t"\
/*i8-b*/"ldp	w8 ,w9 ,[x17,#0x20]	\n\t									stp	q16,q17,[x4]	\n\t"/* o4 */\
		"ldp	w10,w11,[x17,#0x28]	\n\t									stp	q19,q23,[x5]	\n\t"/* o5 */\
		"add	x8 ,x16,x8 ,lsl #3	\n\t	add	x9 ,x16,x9 ,lsl #3	\n\t	stp	q20,q21,[x6]	\n\t"/* o6 */\
		"add	x10,x16,x10,lsl #3	\n\t	add	x11,x16,x11,lsl #3	\n\t	stp	q22,q18,[x7]	\n\t"/* o7 */\
		/* SSE2_RADIX4_DIT_0TWIDDLE_B(r17):	SSE2_RADIX4_DIT_0TWIDDLE_B(r25): */\
		"ldp	q8 ,q9 ,[x8 ]		\n\t	ldp	w12,w13,[x17,#0x30]	\n\t"/* i[c-f] */\
		"ldp	q0 ,q1 ,[x9 ]		\n\t	ldp	w14,w15,[x17,#0x38]	\n\t"\
		"ldp	q10,q11,[x10]		\n\t	add	x12,x16,x12,lsl #3	\n\t	add	x13,x16,x13,lsl #3	\n\t"\
		"ldp	q4 ,q5 ,[x11]		\n\t	add	x14,x16,x14,lsl #3	\n\t	add	x15,x16,x15,lsl #3	\n\t"\
		"fsub	v2.2d ,v8.2d ,v0.2d	\n\t	ldp	q20,q21,[x12]		\n\t"\
		"fsub	v3.2d ,v9.2d ,v1.2d	\n\t	ldp	q12,q13,[x13]		\n\t"\
		"fadd	v0.2d ,v8.2d ,v0.2d	\n\t	ldp	q22,q23,[x14]		\n\t"\
		"fadd	v1.2d ,v9.2d ,v1.2d	\n\t	ldp	q16,q17,[x15]		\n\t"\
/*** done with Input-address reads ***/\
		"fsub	v6.2d ,v10.2d,v4.2d	\n\t	fsub	v14.2d,v20.2d,v12.2d	\n\t"\
		"fsub	v7.2d ,v11.2d,v5.2d	\n\t	fsub	v15.2d,v21.2d,v13.2d	\n\t"\
		"fadd	v4.2d ,v10.2d,v4.2d	\n\t	fadd	v12.2d,v20.2d,v12.2d	\n\t"\
		"fadd	v5.2d ,v11.2d,v5.2d	\n\t	fadd	v13.2d,v21.2d,v13.2d	\n\t"\
		"fsub	v8.2d ,v0.2d,v4.2d	\n\t	fsub	v18.2d,v22.2d,v16.2d	\n\t"\
		"fsub	v9.2d ,v1.2d,v5.2d	\n\t	fsub	v19.2d,v23.2d,v17.2d	\n\t"\
		"fadd	v4.2d ,v0.2d,v4.2d	\n\t	fadd	v16.2d,v22.2d,v16.2d	\n\t"\
		"fadd	v5.2d ,v1.2d,v5.2d	\n\t	fadd	v17.2d,v23.2d,v17.2d	\n\t"\
		"fsub	v10.2d,v2.2d,v7.2d	\n\t	fsub	v20.2d,v12.2d,v16.2d	\n\t"\
		"fsub	v11.2d,v3.2d,v6.2d	\n\t	fsub	v21.2d,v13.2d,v17.2d	\n\t"\
		"fadd	v7.2d ,v2.2d,v7.2d	\n\t	fadd	v16.2d,v12.2d,v16.2d	\n\t"\
		"fadd	v6.2d ,v3.2d,v6.2d	\n\t	fadd	v17.2d,v13.2d,v17.2d	\n\t"\
		"ldr	w15,%[__o4]			\n\t"\
/* o8 */"add	x8 ,x4,x15	\n\t	stp	q4 ,q5 ,[x8 ]	\n\t"\
/* o9 */"add	x9 ,x5,x15	\n\t	stp	q7 ,q11,[x9 ]	\n\t"\
/* oa */"add	x10,x6,x15	\n\t	stp	q8 ,q9 ,[x10]	\n\t"\
/* ob */"add	x11,x7,x15	\n\t	stp	q10,q6 ,[x11]	\n\t"\
					/* rcol: Don't actually store to oc-of yet, just compute addresses: */\
		"		fsub	v22.2d,v14.2d,v19.2d	\n\t	add	x12,x8 ,x15	\n\t"/* oc */\
		"		fsub	v23.2d,v15.2d,v18.2d	\n\t	add	x13,x9 ,x15	\n\t"/* od */\
		"		fadd	v19.2d,v14.2d,v19.2d	\n\t	add	x14,x10,x15	\n\t"/* oe */\
		"		fadd	v18.2d,v15.2d,v18.2d	\n\t	add	x15,x11,x15	\n\t"/* of */\
		/* v20,21 used in next rcol 4-DFT, instead of storing, do reg-copy 20->16,21->30,
		But must carefully arrange order of STP, LDP and MOV here to ensure each lcol datum
		is safely stored before register overwritten by rcol LDP or reg-copy MOV: */\
		"stp	q16,q17,[x12]		\n\t	mov	v28.16b,v21.16b			\n\t"/* 16,28 have 6,7 */\
		"mov	v16.16b,v20.16b		\n\t	ldp	q20,q21,[x2 ]			\n\t"/* 20,21 have 0,1 */\
		"stp	q19,q23,[x13]		\n\t	ldp	q12,q13,[x6 ]			\n\t"/* 12,13 have 2,3 */\
		"stp	q22,q18,[x15]		\n\t	ldp	q22,q23,[x10]			\n\t"/* 22,23 have 4,5 */\
	/*** Now do four pass-2 4-DFTs, inputs from local store, outputs back to same: ***/\
		/* Block 0:							Block 2 (loads for same above): */\
		"ldp	q8 ,q9 ,[x0 ]		\n\t	fadd	v18.2d,v23.2d,v22.2d	\n\t"\
		"ldp	q0 ,q1 ,[x4 ]		\n\t	fsub	v19.2d,v23.2d,v22.2d	\n\t"\
		"ldp	q10,q11,[x8 ]		\n\t	fadd	v17.2d,v16.2d,v28.2d	\n\t"\
		"ldp	q4 ,q5 ,[x12]		\n\t	fsub	v16.2d,v16.2d,v28.2d	\n\t"\
		"fsub	v2.2d ,v8.2d ,v0.2d	\n\t	fmul	v18.2d,v29.2d,v18.2d	\n\t"\
		"fsub	v3.2d ,v9.2d ,v1.2d	\n\t	fmul	v19.2d,v29.2d,v19.2d	\n\t"\
		"fadd	v0.2d ,v8.2d ,v0.2d	\n\t	fmul	v16.2d,v29.2d,v16.2d	\n\t"\
		"fadd	v1.2d ,v9.2d ,v1.2d	\n\t	fmul	v17.2d,v29.2d,v17.2d	\n\t"\
		"fsub	v6.2d ,v10.2d,v4.2d	\n\t	fsub	v22.2d,v18.2d,v16.2d	\n\t"\
		"fsub	v7.2d ,v11.2d,v5.2d	\n\t	fadd	v23.2d,v18.2d,v16.2d	\n\t"\
		"fadd	v4.2d ,v10.2d,v4.2d	\n\t	fsub	v16.2d,v19.2d,v17.2d	\n\t"\
		"fadd	v5.2d ,v11.2d,v5.2d	\n\t	fadd	v17.2d,v19.2d,v17.2d	\n\t"\
		"fsub	v8.2d ,v0.2d,v4.2d	\n\t	fsub	v18.2d,v20.2d,v13.2d	\n\t"\
		"fsub	v9.2d ,v1.2d,v5.2d	\n\t	fadd	v19.2d,v20.2d,v13.2d	\n\t"\
		"fadd	v4.2d ,v0.2d,v4.2d	\n\t	fsub	v20.2d,v21.2d,v12.2d	\n\t"\
		"fadd	v5.2d ,v1.2d,v5.2d	\n\t	fadd	v21.2d,v21.2d,v12.2d	\n\t"\
		"fsub	v10.2d,v2.2d,v7.2d	\n\t	fsub	v12.2d,v18.2d,v17.2d	\n\t"\
		"fsub	v11.2d,v3.2d,v6.2d	\n\t	fadd	v13.2d,v18.2d,v17.2d	\n\t"\
		"fadd	v7.2d ,v2.2d,v7.2d	\n\t	fsub	v18.2d,v19.2d,v22.2d	\n\t"\
		"fadd	v6.2d ,v3.2d,v6.2d	\n\t	fadd	v19.2d,v19.2d,v22.2d	\n\t"\
		"stp	q4 ,q5 ,[x0 ]		\n\t	fsub	v14.2d,v20.2d,v16.2d	\n\t"\
		"stp	q7 ,q11,[x4 ]		\n\t	fadd	v15.2d,v20.2d,v16.2d	\n\t"\
		"stp	q8 ,q9 ,[x8 ]		\n\t	fsub	v20.2d,v21.2d,v23.2d	\n\t"\
		"stp	q10,q6 ,[x12]		\n\t	fadd	v21.2d,v21.2d,v23.2d	\n\t"\
		"ldp	q10,q11,[x1 ]		\n\t	stp	q19,q15,[x2 ]		\n\t"\
		"ldp	q2 ,q3 ,[x5 ]		\n\t	stp	q13,q20,[x6 ]		\n\t"\
		"ldp	q6 ,q7 ,[x9 ]		\n\t	stp	q18,q14,[x10 ]		\n\t"\
		"ldp	q8 ,q9 ,[x13]		\n\t	stp	q12,q21,[x14]		\n\t"\
		/* Block 1:							Block 3: */\
		"fmul	v0.2d,v8.2d,v31.2d	\n\t	ldp	q22,q23,[x3 ]		\n\t"\
		"fmul	v1.2d,v9.2d,v31.2d	\n\t	ldp	q14,q15,[x7 ]		\n\t"\
		"fmla	v0.2d,v9.2d,v30.2d	\n\t	ldp	q18,q19,[x11]		\n\t"\
		"fmls	v1.2d,v8.2d,v30.2d	\n\t	ldp	q20,q21,[x15]		\n\t"\
		"fmul	v8.2d,v6.2d,v30.2d	\n\t	fmul	v12.2d,v20.2d,v30.2d	\n\t"\
		"fmul	v9.2d,v7.2d,v30.2d	\n\t	fmul	v13.2d,v21.2d,v30.2d	\n\t"\
		"fmla	v8.2d,v7.2d,v31.2d	\n\t	fmla	v12.2d,v21.2d,v31.2d	\n\t"\
		"fmls	v9.2d,v6.2d,v31.2d	\n\t	fmls	v13.2d,v20.2d,v31.2d	\n\t"\
		"fadd	v4.2d,v8.2d,v0.2d	\n\t	fmul	v20.2d,v18.2d,v31.2d	\n\t"\
		"fadd	v5.2d,v9.2d,v1.2d	\n\t	fmul	v21.2d,v19.2d,v31.2d	\n\t"\
		"fsub	v6.2d,v8.2d,v0.2d	\n\t	fmla	v20.2d,v19.2d,v30.2d	\n\t"\
		"fsub	v7.2d,v9.2d,v1.2d	\n\t	fmls	v21.2d,v18.2d,v30.2d	\n\t"\
		"fadd	v8.2d,v2.2d,v3.2d	\n\t	fadd	v16.2d,v20.2d,v12.2d	\n\t"\
		"fsub	v9.2d,v3.2d,v2.2d	\n\t	fadd	v17.2d,v21.2d,v13.2d	\n\t"\
		"fmul	v8.2d,v8.2d,v29.2d	\n\t	fsub	v18.2d,v20.2d,v12.2d	\n\t"\
		"fmul	v9.2d,v9.2d,v29.2d	\n\t	fsub	v19.2d,v21.2d,v13.2d	\n\t"\
		"fsub	v0.2d,v10.2d,v8.2d	\n\t	fsub	v20.2d,v14.2d,v15.2d	\n\t"\
		"fsub	v1.2d,v11.2d,v9.2d	\n\t	fadd	v21.2d,v15.2d,v14.2d	\n\t"\
		"fadd	v2.2d,v10.2d,v8.2d	\n\t	fmul	v20.2d,v20.2d,v29.2d	\n\t"\
		"fadd	v3.2d,v11.2d,v9.2d	\n\t	fmul	v21.2d,v21.2d,v29.2d	\n\t"\
		"fadd	v8.2d,v2.2d,v4.2d	\n\t	fsub	v12.2d,v22.2d,v20.2d	\n\t"\
		"fadd	v9.2d,v3.2d,v5.2d	\n\t	fsub	v13.2d,v23.2d,v21.2d	\n\t"\
		"fsub	v2.2d,v2.2d,v4.2d	\n\t	fadd	v14.2d,v22.2d,v20.2d	\n\t"\
		"fsub	v3.2d,v3.2d,v5.2d	\n\t	fadd	v15.2d,v23.2d,v21.2d	\n\t"\
		"fadd	v4.2d,v0.2d,v7.2d	\n\t	fadd	v20.2d,v12.2d,v18.2d	\n\t"\
		"fadd	v5.2d,v1.2d,v6.2d	\n\t	fadd	v21.2d,v13.2d,v19.2d	\n\t"\
		"fsub	v0.2d,v0.2d,v7.2d	\n\t	fsub	v12.2d,v12.2d,v18.2d	\n\t"\
		"fsub	v1.2d,v1.2d,v6.2d	\n\t	fsub	v13.2d,v13.2d,v19.2d	\n\t"\
		"stp	q8 ,q9 ,[x1 ]		\n\t	fadd	v18.2d,v14.2d,v17.2d	\n\t"\
		"stp	q2 ,q3 ,[x9 ]		\n\t	fadd	v19.2d,v15.2d,v16.2d	\n\t"\
		"stp	q4 ,q1 ,[x5 ]		\n\t	fsub	v14.2d,v14.2d,v17.2d	\n\t"\
		"stp	q0 ,q5 ,[x13]		\n\t	fsub	v15.2d,v15.2d,v16.2d	\n\t"\
		"									stp	q20,q21,[x3 ]		\n\t"\
		"									stp	q12,q13,[x11]		\n\t"\
		"									stp	q18,q15,[x7 ]		\n\t"\
		"									stp	q14,q19,[x15]		\n\t"\
		:					/* outputs: none */\
		:[in0] "m" (Xin0)	/* Input-address-16-tet base pointer */\
		,[off] "m" (Xoff)	/* and pointer to uint32 array of 16 double* index offsets */\
		,[__isrt2] "m" (Xisrt2)\
		,[__two] "m" (Xtwo)\
		,[__out0] "m" (Xout0)\
		,[__o1] "m" (Xo1)\
		,[__o2] "m" (Xo2)\
		,[__o3] "m" (Xo3)\
		,[__o4] "m" (Xo4)\
		: "cc","memory","x0","x1","x2","x3","x4","x5","x6","x7","x8","x9","x10","x11","x12","x13","x14","x15","x16","x17",\
			"v0","v1","v2","v3","v4","v5","v6","v7","v8","v9","v10","v11","v12","v13","v14","v15",\
			"v16","v17","v18","v19","v20","v21","v22","v23", "v28","v29","v30","v31"	/* Clobbered registers - x18,x29 are reserved on Apple platforms */\
	);\
	}

	/* Dec 2020: Needed to cut #args for Apple M1/Clang builds on Arm64 - do similar on x86 to avoid
	multiple versions of the macro having different arglists. Replace 16 O-addresses with O-base-address
	out0 and pointer to array of 16 int offset-indices: */
	#define SSE2_RADIX16_DIF_0TWIDDLE(Xin0,Xi1,Xi2,Xi3,Xi4, Xisrt2,Xtwo, Xout0,Xoff)\
	{\
	__asm__ volatile (\
		/* isrt2,cc0,ss0 not needed until pass 2, but use GPR x7 to load three vector-regs now: */\
		"ldr	x7,%[__isrt2]		\n\t	ld1r	{v29.2d},[x7]	\n\t"\
		"ldp	q30,q31,[x7,#0x10]	\n\t"/* cc0,ss0 */\
		/* SSE2_RADIX4_DIF_IN_PLACE(r1,17,9,25)		(r5,21,13,29): */\
		"ldr	x0,%[__in0]			\n\t	ldr	w14,%[__i2]			\n\t"\
		"ldr	w12,%[__i4]			\n\t	ldr	w15,%[__i1]			\n\t"\
		"add	x4 ,x0,x12			\n\t"\
		"add	x8 ,x4,x12			\n\t"\
		"add	x12,x8,x12			\n\t"\
		"ldp	q8 ,q9 ,[x0 ]		\n\t	add	x2 ,x0 ,x14			\n\t"\
		"ldp	q0 ,q1 ,[x8 ]		\n\t	add	x6 ,x4 ,x14			\n\t"\
		"ldp	q10,q11,[x4 ]		\n\t	add	x10,x8 ,x14			\n\t"\
		"ldp	q4 ,q5 ,[x12]		\n\t	add	x14,x12,x14			\n\t"\
		"fsub	v2.2d ,v8.2d ,v0.2d	\n\t	ldp	q20,q21,[x2 ]		\n\t"\
		"fsub	v3.2d ,v9.2d ,v1.2d	\n\t	ldp	q12,q13,[x10]		\n\t"\
		"fadd	v0.2d ,v8.2d ,v0.2d	\n\t	ldp	q22,q23,[x6 ]		\n\t"\
		"fadd	v1.2d ,v9.2d ,v1.2d	\n\t	ldp	q16,q17,[x14]		\n\t"\
		"fsub	v6.2d ,v10.2d,v4.2d	\n\t	fsub	v14.2d,v20.2d,v12.2d	\n\t"\
		"fsub	v7.2d ,v11.2d,v5.2d	\n\t	fsub	v15.2d,v21.2d,v13.2d	\n\t"\
		"fadd	v4.2d ,v10.2d,v4.2d	\n\t	fadd	v12.2d,v20.2d,v12.2d	\n\t"\
		"fadd	v5.2d ,v11.2d,v5.2d	\n\t	fadd	v13.2d,v21.2d,v13.2d	\n\t"\
		"fsub	v8.2d ,v0.2d,v4.2d	\n\t	fsub	v18.2d,v22.2d,v16.2d	\n\t"\
		"fsub	v9.2d ,v1.2d,v5.2d	\n\t	fsub	v19.2d,v23.2d,v17.2d	\n\t"\
		"fadd	v4.2d ,v0.2d,v4.2d	\n\t	fadd	v16.2d,v22.2d,v16.2d	\n\t"\
		"fadd	v5.2d ,v1.2d,v5.2d	\n\t	fadd	v17.2d,v23.2d,v17.2d	\n\t"\
		"fsub	v10.2d,v2.2d,v7.2d	\n\t	fsub	v20.2d,v12.2d,v16.2d	\n\t"\
		"fsub	v11.2d,v3.2d,v6.2d	\n\t	fsub	v21.2d,v13.2d,v17.2d	\n\t"\
		"fadd	v7.2d ,v2.2d,v7.2d	\n\t	fadd	v16.2d,v12.2d,v16.2d	\n\t"\
		"fadd	v6.2d ,v3.2d,v6.2d	\n\t	fadd	v17.2d,v13.2d,v17.2d	\n\t"\
		"stp	q4 ,q5 ,[x0 ]	\n\t	add	x1 ,x0 ,x15	\n\t	fsub	v22.2d,v14.2d,v19.2d	\n\t"\
		"stp	q10,q6 ,[x4 ]	\n\t	add	x5 ,x4 ,x15	\n\t	fsub	v23.2d,v15.2d,v18.2d	\n\t"\
		"stp	q8 ,q9 ,[x8 ]	\n\t	add	x9 ,x8 ,x15	\n\t	fadd	v19.2d,v14.2d,v19.2d	\n\t"\
		"stp	q7 ,q11,[x12]	\n\t	add	x13,x12,x15	\n\t	fadd	v18.2d,v15.2d,v18.2d	\n\t"\
		"ldp	q8 ,q9 ,[x1 ]	\n\t	add	x3 ,x2 ,x15	\n\t	stp	q16,q17,[x2 ]		\n\t"\
		"ldp	q0 ,q1 ,[x9 ]	\n\t	add	x7 ,x6 ,x15	\n\t	stp	q22,q18,[x6 ]		\n\t"\
		"ldp	q10,q11,[x5 ]	\n\t	add	x11,x10,x15	\n\t	stp	q20,q21,[x10]		\n\t"\
		"ldp	q4 ,q5 ,[x13]	\n\t	add	x15,x14,x15	\n\t	stp	q19,q23,[x14]		\n\t"\
		/* SSE2_RADIX4_DIF_IN_PLACE(r3,19,11,27)		(r7,23,15,31): */\
		"fsub	v2.2d ,v8.2d ,v0.2d	\n\t	ldp	q20,q21,[x3 ]		\n\t"\
		"fsub	v3.2d ,v9.2d ,v1.2d	\n\t	ldp	q12,q13,[x11]		\n\t"\
		"fadd	v0.2d ,v8.2d ,v0.2d	\n\t	ldp	q22,q23,[x7 ]		\n\t"\
		"fadd	v1.2d ,v9.2d ,v1.2d	\n\t	ldp	q16,q17,[x15]		\n\t"\
		"fsub	v6.2d ,v10.2d,v4.2d	\n\t	fsub	v14.2d,v20.2d,v12.2d	\n\t"\
		"fsub	v7.2d ,v11.2d,v5.2d	\n\t	fsub	v15.2d,v21.2d,v13.2d	\n\t"\
		"fadd	v4.2d ,v10.2d,v4.2d	\n\t	fadd	v12.2d,v20.2d,v12.2d	\n\t"\
		"fadd	v5.2d ,v11.2d,v5.2d	\n\t	fadd	v13.2d,v21.2d,v13.2d	\n\t"\
		"fsub	v8.2d ,v0.2d,v4.2d	\n\t	fsub	v18.2d,v22.2d,v16.2d	\n\t"\
		"fsub	v9.2d ,v1.2d,v5.2d	\n\t	fsub	v19.2d,v23.2d,v17.2d	\n\t"\
		"fadd	v4.2d ,v0.2d,v4.2d	\n\t	fadd	v16.2d,v22.2d,v16.2d	\n\t"\
		"fadd	v5.2d ,v1.2d,v5.2d	\n\t	fadd	v17.2d,v23.2d,v17.2d	\n\t"\
		"fsub	v10.2d,v2.2d,v7.2d	\n\t	fsub	v20.2d,v12.2d,v16.2d	\n\t"\
		"fsub	v11.2d,v3.2d,v6.2d	\n\t	fsub	v21.2d,v13.2d,v17.2d	\n\t"\
		"fadd	v7.2d ,v2.2d,v7.2d	\n\t	fadd	v16.2d,v12.2d,v16.2d	\n\t"\
		"fadd	v6.2d ,v3.2d,v6.2d	\n\t	fadd	v17.2d,v13.2d,v17.2d	\n\t"\
		"stp	q4 ,q5 ,[x1 ]		\n\t	fsub	v22.2d,v14.2d,v19.2d	\n\t"\
		"stp	q10,q6 ,[x5 ]		\n\t	fsub	v23.2d,v15.2d,v18.2d	\n\t"\
		"stp	q8 ,q9 ,[x9 ]		\n\t	fadd	v19.2d,v14.2d,v19.2d	\n\t"\
		"stp	q7 ,q11,[x13]		\n\t	fadd	v18.2d,v15.2d,v18.2d	\n\t"\
		/* v20,21 used in next rcol 4-DFT, instead of storing, do reg-copy 20->14,21->15,
		But must carefully arrange order of STP, LDP and MOV here to ensure each lcol datum
		is safely stored before register overwritten by rcol LDP or reg-copy MOV: */\
		"stp	q16,q17,[x3 ]		\n\t	ldp	q24,q25,[x10]	\n\t"/* 2,3 */\
		"stp	q22,q18,[x7 ]		\n\t	ldp	q12,q13,[x9 ]	\n\t"/* 4,5 */\
		"mov	v14.16b,v20.16b		\n\t	mov	v15.16b,v21.16b	\n\t"/* 6,7 */\
		"stp	q19,q23,[x15]		\n\t	ldp	q22,q23,[x8 ]	\n\t"/* 0,1 - load this pair last because need to wait for lcol writes of q22,23 to issue */\
	/*** Now do four pass-2 4-DFTs, inputs from local store, outputs back to main-array: ***/\
	/* Use GPRs x0-15 with O-addresses after data-reload from each sub-quartet of those I-addresses finishes */\
		/* Load output base-address into x16 and offset-array pointer into x17: */\
		"ldr	x16,%[out0]			\n\t	ldr	x17,%[off]			\n\t"\
		/* Block 0:							Block 2 (loads for same above): */\
		"ldp	q8 ,q9 ,[x0 ]		\n\t	fsub	v16.2d,v12.2d,v13.2d	\n\t"\
		"ldp	q2 ,q3 ,[x2 ]		\n\t	fadd	v17.2d,v12.2d,v13.2d	\n\t"\
		"ldp	q10,q11,[x1 ]		\n\t	fadd	v18.2d,v15.2d,v14.2d	\n\t"\
		"ldp	q6 ,q7 ,[x3 ]		\n\t	fsub	v19.2d,v15.2d,v14.2d	\n\t"\
		"fsub	v0.2d ,v8.2d ,v2.2d	\n\t	fmul	v20.2d,v29.2d,v16.2d	\n\t"\
		"fsub	v1.2d ,v9.2d ,v3.2d	\n\t	fmul	v21.2d,v29.2d,v17.2d	\n\t"\
		"fadd	v2.2d ,v8.2d ,v2.2d	\n\t	fmul	v18.2d,v29.2d,v18.2d	\n\t"\
		"fadd	v3.2d ,v9.2d ,v3.2d	\n\t	fmul	v19.2d,v29.2d,v19.2d	\n\t"\
		"fsub	v4.2d ,v10.2d,v6.2d	\n\t	fsub	v12.2d,v22.2d,v25.2d	\n\t	ldp	w0,w1,[x17]			\n\t"/* off[0|1] */\
		"fsub	v5.2d ,v11.2d,v7.2d	\n\t	fsub	v13.2d,v23.2d,v24.2d	\n\t	ldp	w2,w3,[x17,#0x08]	\n\t"/* off[2|3] */\
		"fadd	v6.2d ,v10.2d,v6.2d	\n\t	fadd	v14.2d,v23.2d,v24.2d	\n\t	add	x0,x16,x0,lsl #3	\n\t"/* out0 + off[0] */\
		"fadd	v7.2d ,v11.2d,v7.2d	\n\t	fadd	v15.2d,v22.2d,v25.2d	\n\t	add	x1,x16,x1,lsl #3	\n\t"/* out0 + off[1] */\
		"fsub	v8.2d ,v2.2d,v6.2d	\n\t	fsub	v16.2d,v20.2d,v18.2d	\n\t	add	x2,x16,x2,lsl #3	\n\t"/* out0 + off[2] */\
		"fsub	v9.2d ,v3.2d,v7.2d	\n\t	fsub	v17.2d,v21.2d,v19.2d	\n\t	add	x3,x16,x3,lsl #3	\n\t"/* out0 + off[3] */\
		"fadd	v6.2d ,v2.2d,v6.2d	\n\t	fadd	v18.2d,v20.2d,v18.2d	\n\t	ldp	w8 ,w9 ,[x17,#0x10]	\n\t"/* off[4|5] */\
		"fadd	v7.2d ,v3.2d,v7.2d	\n\t	fadd	v19.2d,v21.2d,v19.2d	\n\t	ldp	w10,w11,[x17,#0x18]	\n\t"/* off[6|7] */\
		"fsub	v10.2d,v0.2d,v5.2d	\n\t	fsub	v20.2d,v12.2d,v16.2d	\n\t	add	x8 ,x16,x8 ,lsl #3	\n\t"/* out0 + off[4] */\
		"fsub	v11.2d,v1.2d,v4.2d	\n\t	fadd	v16.2d,v12.2d,v16.2d	\n\t	add	x9 ,x16,x9 ,lsl #3	\n\t"/* out0 + off[5] */\
		"fadd	v5.2d ,v0.2d,v5.2d	\n\t	fsub	v21.2d,v14.2d,v17.2d	\n\t	add	x10,x16,x10,lsl #3	\n\t"/* out0 + off[6] */\
		"fadd	v4.2d ,v1.2d,v4.2d	\n\t	fadd	v17.2d,v14.2d,v17.2d	\n\t	add	x11,x16,x11,lsl #3	\n\t"/* out0 + off[7] */\
		"stp	q6 ,q7 ,[x0]		\n\t	fsub	v22.2d,v13.2d,v18.2d	\n\t"\
		"stp	q8 ,q9 ,[x1]		\n\t	fadd	v18.2d,v13.2d,v18.2d	\n\t"\
		"stp	q10,q4 ,[x2]		\n\t	fsub	v23.2d,v15.2d,v19.2d	\n\t"\
		"stp	q5 ,q11,[x3]		\n\t	fadd	v19.2d,v15.2d,v19.2d	\n\t"\
		/* Block 1:							Block 3: */\
		"ldp	q0 ,q1 ,[x4]		\n\t	stp	q16,q17,[x8 ]		\n\t"\
		"ldp	q2 ,q3 ,[x6]		\n\t	stp	q20,q21,[x9 ]		\n\t"\
		"ldp	q4 ,q5 ,[x5]		\n\t	stp	q23,q18,[x10]		\n\t"\
		"ldp	q10,q11,[x7]		\n\t	stp	q19,q22,[x11]		\n\t"\
		"fmul	v8.2d,v4.2d,v30.2d	\n\t	ldp	q12,q13,[x12]			\n\t"\
		"fmul	v9.2d,v5.2d,v30.2d	\n\t	ldp	q14,q15,[x14]			\n\t"\
		"fmls	v8.2d,v5.2d,v31.2d	\n\t	ldp	q16,q17,[x13]			\n\t"\
		"fmla	v9.2d,v4.2d,v31.2d	\n\t	ldp	q18,q19,[x15]			\n\t"\
		"fmul	v6.2d,v10.2d,v31.2d	\n\t	fmul	v20.2d,v16.2d,v31.2d \n\t"\
		"fmul	v7.2d,v11.2d,v31.2d	\n\t	fmul	v21.2d,v17.2d,v31.2d \n\t"\
		"fmls	v6.2d,v11.2d,v30.2d	\n\t	fmls	v20.2d,v17.2d,v30.2d \n\t"\
		"fmla	v7.2d,v10.2d,v30.2d	\n\t	fmla	v21.2d,v16.2d,v30.2d \n\t"\
		"fsub	v4.2d,v8.2d,v6.2d	\n\t	fmul	v22.2d,v18.2d,v30.2d \n\t"\
		"fsub	v5.2d,v9.2d,v7.2d	\n\t	fmul	v23.2d,v19.2d,v30.2d \n\t"\
		"fadd	v6.2d,v8.2d,v6.2d	\n\t	fmls	v22.2d,v19.2d,v31.2d \n\t"\
		"fadd	v7.2d,v9.2d,v7.2d	\n\t	fmla	v23.2d,v18.2d,v31.2d \n\t"\
		"fsub	v8.2d,v2.2d,v3.2d	\n\t	fsub	v16.2d,v20.2d,v22.2d \n\t	ldp	w0,w1,[x17,#0x20]	\n\t"/* off[8|9] */\
		"fadd	v9.2d,v2.2d,v3.2d	\n\t	fsub	v17.2d,v21.2d,v23.2d \n\t	ldp	w2,w3,[x17,#0x28]	\n\t"/* off[a|b] */\
		"fmul	v8.2d,v8.2d,v29.2d	\n\t	fadd	v18.2d,v20.2d,v22.2d \n\t	add	x0,x16,x0,lsl #3	\n\t"/* out0 + off[8] */\
		"fmul	v9.2d,v9.2d,v29.2d	\n\t	fadd	v19.2d,v21.2d,v23.2d \n\t	add	x1,x16,x1,lsl #3	\n\t"/* out0 + off[9] */\
		"fadd	v2.2d,v0.2d,v8.2d	\n\t	fadd	v20.2d,v15.2d,v14.2d \n\t	add	x2,x16,x2,lsl #3	\n\t"/* out0 + off[a] */\
		"fadd	v3.2d,v1.2d,v9.2d	\n\t	fsub	v21.2d,v15.2d,v14.2d \n\t	add	x3,x16,x3,lsl #3	\n\t"/* out0 + off[b] */\
		"fsub	v0.2d,v0.2d,v8.2d	\n\t	fmul	v20.2d,v20.2d,v29.2d \n\t	ldp	w8 ,w9 ,[x17,#0x30]	\n\t"/* off[c|d] */\
		"fsub	v1.2d,v1.2d,v9.2d	\n\t	fmul	v21.2d,v21.2d,v29.2d \n\t	ldp	w10,w11,[x17,#0x38]	\n\t"/* off[e|f] */\
		"fadd	v8.2d,v2.2d,v6.2d	\n\t	fadd	v14.2d,v12.2d,v20.2d \n\t	add	x8 ,x16,x8 ,lsl #3	\n\t"/* out0 + off[c] */\
		"fadd	v9.2d,v3.2d,v7.2d	\n\t	fadd	v15.2d,v13.2d,v21.2d \n\t	add	x9 ,x16,x9 ,lsl #3	\n\t"/* out0 + off[d] */\
		"fsub	v2.2d,v2.2d,v6.2d	\n\t	fsub	v12.2d,v12.2d,v20.2d \n\t	add	x10,x16,x10,lsl #3	\n\t"/* out0 + off[e] */\
		"fsub	v3.2d,v3.2d,v7.2d	\n\t	fsub	v13.2d,v13.2d,v21.2d \n\t	add	x11,x16,x11,lsl #3	\n\t"/* out0 + off[f] */\
		"fadd	v10.2d,v0.2d,v5.2d	\n\t	fadd	v20.2d,v12.2d,v16.2d \n\t"\
		"fadd	v11.2d,v1.2d,v4.2d	\n\t	fadd	v21.2d,v13.2d,v17.2d \n\t"\
		"fsub	v0.2d,v0.2d,v5.2d	\n\t	fsub	v12.2d,v12.2d,v16.2d \n\t"\
		"fsub	v1.2d,v1.2d,v4.2d	\n\t	fsub	v13.2d,v13.2d,v17.2d \n\t"\
		"stp	q8 ,q9 ,[x0]		\n\t	fadd	v22.2d,v14.2d,v19.2d \n\t"\
		"stp	q2 ,q3 ,[x1]		\n\t	fadd	v23.2d,v15.2d,v18.2d \n\t"\
		"stp	q0 ,q11,[x2]		\n\t	fsub	v14.2d,v14.2d,v19.2d \n\t"\
		"stp	q10,q1 ,[x3]		\n\t	fsub	v15.2d,v15.2d,v18.2d \n\t"\
		"									stp	q20,q21,[x8 ]	\n\t"\
		"									stp	q12,q13,[x9 ]	\n\t"\
		"									stp	q14,q23,[x10]	\n\t"\
		"									stp	q22,q15,[x11]	\n\t"\
		:					/* outputs: none */\
		:[__in0] "m" (Xin0)	/* All inputs from memory addresses here */\
		,[__i1] "m" (Xi1)\
		,[__i2] "m" (Xi2)\
		,[__i3] "m" (Xi3)\
		,[__i4] "m" (Xi4)\
		,[__isrt2] "m" (Xisrt2)\
		,[__two] "m" (Xtwo)\
		,[out0] "m" (Xout0) /* output-address-octet base pointer */\
		,[off] "m" (Xoff)	/* and pointer to uint32 array of 8 double* index offsets */\
		: "cc","memory","x0","x1","x2","x3","x4","x5","x6","x7","x8","x9","x10","x11","x12","x13","x14","x15","x16","x17",\
			"v0","v1","v2","v3","v4","v5","v6","v7","v8","v9","v10","v11","v12","v13","v14","v15",\
			"v16","v17","v18","v19","v20","v21","v22","v23","v24","v25", "v29","v30","v31"	/* Clobbered registers */\
	);\
	}

	// Same as above, but with specifiable I-addresses and regularly spaced O-addresses:
	#define SSE2_RADIX16_DIF_0TWIDDLE_B(Xin0,Xi1,Xi2,Xi3,Xi4, Xisrt2,Xtwo, Xout0)\
	{\
	__asm__ volatile (\
		/* isrt2,cc0,ss0 not needed until pass 2, but use GPR x7 to load three vector-regs now: */\
		"ldr	x7,%[__isrt2]		\n\t	ld1r	{v29.2d},[x7]	\n\t"\
		"ldp	q30,q31,[x7,#0x10]	\n\t"/* cc0,ss0 */\
		/* SSE2_RADIX4_DIF_IN_PLACE(r1,17,9,25)		(r5,21,13,29): */\
		"ldr	x0,%[__in0]			\n\t	ldr	w14,%[__i2]			\n\t"\
		"ldr	w12,%[__i4]			\n\t	ldr	w15,%[__i1]			\n\t"\
		"add	x4 ,x0,x12			\n\t"\
		"add	x8 ,x4,x12			\n\t"\
		"add	x12,x8,x12			\n\t"\
		"ldp	q8 ,q9 ,[x0 ]		\n\t	add	x2 ,x0 ,x14			\n\t"\
		"ldp	q0 ,q1 ,[x8 ]		\n\t	add	x6 ,x4 ,x14			\n\t"\
		"ldp	q10,q11,[x4 ]		\n\t	add	x10,x8 ,x14			\n\t"\
		"ldp	q4 ,q5 ,[x12]		\n\t	add	x14,x12,x14			\n\t"\
		"fsub	v2.2d ,v8.2d ,v0.2d	\n\t	ldp	q20,q21,[x2 ]		\n\t"\
		"fsub	v3.2d ,v9.2d ,v1.2d	\n\t	ldp	q12,q13,[x10]		\n\t"\
		"fadd	v0.2d ,v8.2d ,v0.2d	\n\t	ldp	q22,q23,[x6 ]		\n\t"\
		"fadd	v1.2d ,v9.2d ,v1.2d	\n\t	ldp	q16,q17,[x14]		\n\t"\
		"fsub	v6.2d ,v10.2d,v4.2d	\n\t	fsub	v14.2d,v20.2d,v12.2d	\n\t"\
		"fsub	v7.2d ,v11.2d,v5.2d	\n\t	fsub	v15.2d,v21.2d,v13.2d	\n\t"\
		"fadd	v4.2d ,v10.2d,v4.2d	\n\t	fadd	v12.2d,v20.2d,v12.2d	\n\t"\
		"fadd	v5.2d ,v11.2d,v5.2d	\n\t	fadd	v13.2d,v21.2d,v13.2d	\n\t"\
		"fsub	v8.2d ,v0.2d,v4.2d	\n\t	fsub	v18.2d,v22.2d,v16.2d	\n\t"\
		"fsub	v9.2d ,v1.2d,v5.2d	\n\t	fsub	v19.2d,v23.2d,v17.2d	\n\t"\
		"fadd	v4.2d ,v0.2d,v4.2d	\n\t	fadd	v16.2d,v22.2d,v16.2d	\n\t"\
		"fadd	v5.2d ,v1.2d,v5.2d	\n\t	fadd	v17.2d,v23.2d,v17.2d	\n\t"\
		"fsub	v10.2d,v2.2d,v7.2d	\n\t	fsub	v20.2d,v12.2d,v16.2d	\n\t"\
		"fsub	v11.2d,v3.2d,v6.2d	\n\t	fsub	v21.2d,v13.2d,v17.2d	\n\t"\
		"fadd	v7.2d ,v2.2d,v7.2d	\n\t	fadd	v16.2d,v12.2d,v16.2d	\n\t"\
		"fadd	v6.2d ,v3.2d,v6.2d	\n\t	fadd	v17.2d,v13.2d,v17.2d	\n\t"\
		"stp	q4 ,q5 ,[x0 ]	\n\t	add	x1 ,x0 ,x15	\n\t	fsub	v22.2d,v14.2d,v19.2d	\n\t"\
		"stp	q10,q6 ,[x4 ]	\n\t	add	x5 ,x4 ,x15	\n\t	fsub	v23.2d,v15.2d,v18.2d	\n\t"\
		"stp	q8 ,q9 ,[x8 ]	\n\t	add	x9 ,x8 ,x15	\n\t	fadd	v19.2d,v14.2d,v19.2d	\n\t"\
		"stp	q7 ,q11,[x12]	\n\t	add	x13,x12,x15	\n\t	fadd	v18.2d,v15.2d,v18.2d	\n\t"\
		"ldp	q8 ,q9 ,[x1 ]	\n\t	add	x3 ,x2 ,x15	\n\t	stp	q16,q17,[x2 ]		\n\t"\
		"ldp	q0 ,q1 ,[x9 ]	\n\t	add	x7 ,x6 ,x15	\n\t	stp	q22,q18,[x6 ]		\n\t"\
		"ldp	q10,q11,[x5 ]	\n\t	add	x11,x10,x15	\n\t	stp	q20,q21,[x10]		\n\t"\
		"ldp	q4 ,q5 ,[x13]	\n\t	add	x15,x14,x15	\n\t	stp	q19,q23,[x14]		\n\t"\
		/* SSE2_RADIX4_DIF_IN_PLACE(r3,19,11,27)		(r7,23,15,31): */\
		"fsub	v2.2d ,v8.2d ,v0.2d	\n\t	ldp	q20,q21,[x3 ]		\n\t"\
		"fsub	v3.2d ,v9.2d ,v1.2d	\n\t	ldp	q12,q13,[x11]		\n\t"\
		"fadd	v0.2d ,v8.2d ,v0.2d	\n\t	ldp	q22,q23,[x7 ]		\n\t"\
		"fadd	v1.2d ,v9.2d ,v1.2d	\n\t	ldp	q16,q17,[x15]		\n\t"\
		"fsub	v6.2d ,v10.2d,v4.2d	\n\t	fsub	v14.2d,v20.2d,v12.2d	\n\t"\
		"fsub	v7.2d ,v11.2d,v5.2d	\n\t	fsub	v15.2d,v21.2d,v13.2d	\n\t"\
		"fadd	v4.2d ,v10.2d,v4.2d	\n\t	fadd	v12.2d,v20.2d,v12.2d	\n\t"\
		"fadd	v5.2d ,v11.2d,v5.2d	\n\t	fadd	v13.2d,v21.2d,v13.2d	\n\t"\
		"fsub	v8.2d ,v0.2d,v4.2d	\n\t	fsub	v18.2d,v22.2d,v16.2d	\n\t"\
		"fsub	v9.2d ,v1.2d,v5.2d	\n\t	fsub	v19.2d,v23.2d,v17.2d	\n\t"\
		"fadd	v4.2d ,v0.2d,v4.2d	\n\t	fadd	v16.2d,v22.2d,v16.2d	\n\t"\
		"fadd	v5.2d ,v1.2d,v5.2d	\n\t	fadd	v17.2d,v23.2d,v17.2d	\n\t"\
		"fsub	v10.2d,v2.2d,v7.2d	\n\t	fsub	v20.2d,v12.2d,v16.2d	\n\t"\
		"fsub	v11.2d,v3.2d,v6.2d	\n\t	fsub	v21.2d,v13.2d,v17.2d	\n\t"\
		"fadd	v7.2d ,v2.2d,v7.2d	\n\t	fadd	v16.2d,v12.2d,v16.2d	\n\t"\
		"fadd	v6.2d ,v3.2d,v6.2d	\n\t	fadd	v17.2d,v13.2d,v17.2d	\n\t"\
		"stp	q4 ,q5 ,[x1 ]		\n\t	fsub	v22.2d,v14.2d,v19.2d	\n\t"\
		"stp	q10,q6 ,[x5 ]		\n\t	fsub	v23.2d,v15.2d,v18.2d	\n\t"\
		"stp	q8 ,q9 ,[x9 ]		\n\t	fadd	v19.2d,v14.2d,v19.2d	\n\t"\
		"stp	q7 ,q11,[x13]		\n\t	fadd	v18.2d,v15.2d,v18.2d	\n\t"\
		/* v20,21 used in next rcol 4-DFT, instead of storing, do reg-copy 20->14,21->15,
		But must carefully arrange order of STP, LDP and MOV here to ensure each lcol datum
		is safely stored before register overwritten by rcol LDP or reg-copy MOV: */\
		"stp	q16,q17,[x3 ]		\n\t	ldp	q24,q25,[x10]	\n\t"/* 2,3 */\
		"stp	q22,q18,[x7 ]		\n\t	ldp	q12,q13,[x9 ]	\n\t"/* 4,5 */\
		"mov	v14.16b,v20.16b		\n\t	mov	v15.16b,v21.16b	\n\t"/* 6,7 */\
		"stp	q19,q23,[x15]		\n\t	ldp	q22,q23,[x8 ]	\n\t"/* 0,1 - load this pair last because need to wait for lcol writes of q22,23 to issue */\
	/*** Now do four pass-2 4-DFTs, inputs from local store, outputs back to main-array: ***/\
		"ldr	x16,%[__out0]		\n\t"\
		/* Block 0:							Block 2 (loads for same above): */\
		"ldp	q8 ,q9 ,[x0 ]		\n\t	fsub	v16.2d,v12.2d,v13.2d	\n\t"\
		"ldp	q2 ,q3 ,[x2 ]		\n\t	fadd	v17.2d,v12.2d,v13.2d	\n\t"\
		"ldp	q10,q11,[x1 ]		\n\t	fadd	v18.2d,v15.2d,v14.2d	\n\t"\
		"ldp	q6 ,q7 ,[x3 ]		\n\t	fsub	v19.2d,v15.2d,v14.2d	\n\t"\
		"fsub	v0.2d ,v8.2d ,v2.2d	\n\t	fmul	v20.2d,v29.2d,v16.2d	\n\t"\
		"fsub	v1.2d ,v9.2d ,v3.2d	\n\t	fmul	v21.2d,v29.2d,v17.2d	\n\t"\
		"fadd	v2.2d ,v8.2d ,v2.2d	\n\t	fmul	v18.2d,v29.2d,v18.2d	\n\t"\
		"fadd	v3.2d ,v9.2d ,v3.2d	\n\t	fmul	v19.2d,v29.2d,v19.2d	\n\t"\
		"fsub	v4.2d ,v10.2d,v6.2d	\n\t	fsub	v12.2d,v22.2d,v25.2d	\n\t"\
		"fsub	v5.2d ,v11.2d,v7.2d	\n\t	fsub	v13.2d,v23.2d,v24.2d	\n\t"\
		"fadd	v6.2d ,v10.2d,v6.2d	\n\t	fadd	v14.2d,v23.2d,v24.2d	\n\t"\
		"fadd	v7.2d ,v11.2d,v7.2d	\n\t	fadd	v15.2d,v22.2d,v25.2d	\n\t"\
		"fsub	v8.2d ,v2.2d,v6.2d	\n\t	fsub	v16.2d,v20.2d,v18.2d	\n\t"\
		"fsub	v9.2d ,v3.2d,v7.2d	\n\t	fsub	v17.2d,v21.2d,v19.2d	\n\t"\
		"fadd	v6.2d ,v2.2d,v6.2d	\n\t	fadd	v18.2d,v20.2d,v18.2d	\n\t"\
		"fadd	v7.2d ,v3.2d,v7.2d	\n\t	fadd	v19.2d,v21.2d,v19.2d	\n\t"\
		"fsub	v10.2d,v0.2d,v5.2d	\n\t	fsub	v20.2d,v12.2d,v16.2d	\n\t"\
		"fsub	v11.2d,v1.2d,v4.2d	\n\t	fadd	v16.2d,v12.2d,v16.2d	\n\t"\
		"fadd	v5.2d ,v0.2d,v5.2d	\n\t	fsub	v21.2d,v14.2d,v17.2d	\n\t"\
		"fadd	v4.2d ,v1.2d,v4.2d	\n\t	fadd	v17.2d,v14.2d,v17.2d	\n\t"\
		"stp	q6 ,q7 ,[x16      ]	\n\t	fsub	v22.2d,v13.2d,v18.2d	\n\t"\
		"stp	q8 ,q9 ,[x16,#0x20]	\n\t	fadd	v18.2d,v13.2d,v18.2d	\n\t"\
		"stp	q10,q4 ,[x16,#0x40]	\n\t	fsub	v23.2d,v15.2d,v19.2d	\n\t"\
		"stp	q5 ,q11,[x16,#0x60]	\n\t	fadd	v19.2d,v15.2d,v19.2d	\n\t"\
		/* Block 1:							Block 3: */\
		"ldp	q0 ,q1 ,[x4]		\n\t	stp	q16,q17,[x16,#0x80]		\n\t"\
		"ldp	q2 ,q3 ,[x6]		\n\t	stp	q20,q21,[x16,#0xa0]		\n\t"\
		"ldp	q4 ,q5 ,[x5]		\n\t	stp	q23,q18,[x16,#0xc0]		\n\t"\
		"ldp	q10,q11,[x7]		\n\t	stp	q19,q22,[x16,#0xe0]		\n\t"\
		"fmul	v8.2d,v4.2d,v30.2d	\n\t	ldp	q12,q13,[x12]			\n\t"\
		"fmul	v9.2d,v5.2d,v30.2d	\n\t	ldp	q14,q15,[x14]			\n\t"\
		"fmls	v8.2d,v5.2d,v31.2d	\n\t	ldp	q16,q17,[x13]			\n\t"\
		"fmla	v9.2d,v4.2d,v31.2d	\n\t	ldp	q18,q19,[x15]			\n\t"\
		"fmul	v6.2d,v10.2d,v31.2d	\n\t	fmul	v20.2d,v16.2d,v31.2d \n\t"\
		"fmul	v7.2d,v11.2d,v31.2d	\n\t	fmul	v21.2d,v17.2d,v31.2d \n\t"\
		"fmls	v6.2d,v11.2d,v30.2d	\n\t	fmls	v20.2d,v17.2d,v30.2d \n\t"\
		"fmla	v7.2d,v10.2d,v30.2d	\n\t	fmla	v21.2d,v16.2d,v30.2d \n\t"\
		"fsub	v4.2d,v8.2d,v6.2d	\n\t	fmul	v22.2d,v18.2d,v30.2d \n\t"\
		"fsub	v5.2d,v9.2d,v7.2d	\n\t	fmul	v23.2d,v19.2d,v30.2d \n\t"\
		"fadd	v6.2d,v8.2d,v6.2d	\n\t	fmls	v22.2d,v19.2d,v31.2d \n\t"\
		"fadd	v7.2d,v9.2d,v7.2d	\n\t	fmla	v23.2d,v18.2d,v31.2d \n\t"\
		"fsub	v8.2d,v2.2d,v3.2d	\n\t	fsub	v16.2d,v20.2d,v22.2d \n\t"\
		"fadd	v9.2d,v2.2d,v3.2d	\n\t	fsub	v17.2d,v21.2d,v23.2d \n\t"\
		"fmul	v8.2d,v8.2d,v29.2d	\n\t	fadd	v18.2d,v20.2d,v22.2d \n\t"\
		"fmul	v9.2d,v9.2d,v29.2d	\n\t	fadd	v19.2d,v21.2d,v23.2d \n\t"\
		"fadd	v2.2d,v0.2d,v8.2d	\n\t	fadd	v20.2d,v15.2d,v14.2d \n\t"\
		"fadd	v3.2d,v1.2d,v9.2d	\n\t	fsub	v21.2d,v15.2d,v14.2d \n\t"\
		"fsub	v0.2d,v0.2d,v8.2d	\n\t	fmul	v20.2d,v20.2d,v29.2d \n\t"\
		"fsub	v1.2d,v1.2d,v9.2d	\n\t	fmul	v21.2d,v21.2d,v29.2d \n\t"\
		"fadd	v8.2d,v2.2d,v6.2d	\n\t	fadd	v14.2d,v12.2d,v20.2d \n\t"\
		"fadd	v9.2d,v3.2d,v7.2d	\n\t	fadd	v15.2d,v13.2d,v21.2d \n\t"\
		"fsub	v2.2d,v2.2d,v6.2d	\n\t	fsub	v12.2d,v12.2d,v20.2d \n\t"\
		"fsub	v3.2d,v3.2d,v7.2d	\n\t	fsub	v13.2d,v13.2d,v21.2d \n\t"\
		"fadd	v10.2d,v0.2d,v5.2d	\n\t	fadd	v20.2d,v12.2d,v16.2d \n\t"\
		"fadd	v11.2d,v1.2d,v4.2d	\n\t	fadd	v21.2d,v13.2d,v17.2d \n\t"\
		"fsub	v0.2d,v0.2d,v5.2d	\n\t	fsub	v12.2d,v12.2d,v16.2d \n\t"\
		"fsub	v1.2d,v1.2d,v4.2d	\n\t	fsub	v13.2d,v13.2d,v17.2d \n\t"\
		"stp	q8 ,q9 ,[x16,#0x100]\n\t	fadd	v22.2d,v14.2d,v19.2d \n\t"\
		"stp	q2 ,q3 ,[x16,#0x120]\n\t	fadd	v23.2d,v15.2d,v18.2d \n\t"\
		"stp	q0 ,q11,[x16,#0x140]\n\t	fsub	v14.2d,v14.2d,v19.2d \n\t"\
		"stp	q10,q1 ,[x16,#0x160]\n\t	fsub	v15.2d,v15.2d,v18.2d \n\t"\
		"									stp	q20,q21,[x16,#0x180]	\n\t"\
		"									stp	q12,q13,[x16,#0x1a0]	\n\t"\
		"									stp	q14,q23,[x16,#0x1c0]	\n\t"\
		"									stp	q22,q15,[x16,#0x1e0]	\n\t"\
		:					/* outputs: none */\
		:[__in0] "m" (Xin0)	/* All inputs from memory addresses here */\
		,[__i1] "m" (Xi1)\
		,[__i2] "m" (Xi2)\
		,[__i3] "m" (Xi3)\
		,[__i4] "m" (Xi4)\
		,[__isrt2] "m" (Xisrt2)\
		,[__two] "m" (Xtwo)\
		,[__out0] "m" (Xout0)\
		: "cc","memory","x0","x1","x2","x3","x4","x5","x6","x7","x8","x9","x10","x11","x12","x13","x14","x15","x16",\
			"v0","v1","v2","v3","v4","v5","v6","v7","v8","v9","v10","v11",\
			"v12","v13","v14","v15","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25", "v29","v30","v31"	/* Clobbered registers */\
	);\
	}

	/* With-twiddles out-of-place analog of the above twiddleless DIT macro: 15 nontrivial complex input twiddles E1-Ef [E0 assumed = 1].
	The DIT version of this macro processes the twiddles in order.
	NOTE: SINCE THIS MACRO IS SPECIFICALLY DESIGNED AS THE 2ND-PASS OF LARGE-POWER-OF-2-TWIDDLELESS DFT SYNTHESIS, THE
	"TWIDDLES" HERE ARE PURELY OF THE DFT-INTERNAL VARIETY, AND THUS APPLIED TO THE INPUTS, JUST AS FOR THE ABOVE DIF COUNTERPART.

	Sincos layout: Two portions:

	Radix-16 shared consts anchored at isrt2:

	  isrt2 + 0x000;	cc0 + 0x010;	ss0 + 0x020;

	Per-block-specific set of 15 complex twiddles anchored at c1:

		c1  + 0x000;	s1  + 0x010;
		c2  + 0x020;	s2  + 0x030;
		c3  + 0x040;	s3  + 0x050;
		c4  + 0x060;	s4  + 0x070;
		c5  + 0x080;	s5  + 0x090;
		c6  + 0x0a0;	s6  + 0x0b0;
		c7  + 0x0c0;	s7  + 0x0d0;
		c8  + 0x0e0;	s8  + 0x0f0;
		c9  + 0x100;	s9  + 0x110;
		c10 + 0x120;	s10 + 0x130;
		c11 + 0x140;	s11 + 0x150;
		c12 + 0x160;	s12 + 0x170;
		c13 + 0x180;	s13 + 0x190;
		c14 + 0x1a0;	s14 + 0x1b0;
		c15 + 0x1c0;	s15 + 0x1d0;

	Use radix-16 DIF as template for DIT/OOP here, since need a pre-twiddles algorithm:
	*/
	#define SSE2_RADIX16_DIT_TWIDDLE_OOP(Xin0,Xi1,Xi2,Xi3,Xi4, Xout0,Xo1,Xo2,Xo3,Xo4, Xisrt2,Xc1)\
	{\
	__asm__ volatile (\
	/*...Blocks 0,1: Do in-place, i.e. outputs into __in0 + [0,1,2,3]*istride: */\
		"ldr	x0,%[__in0]			\n\t	ldr	w15,%[__i4]			\n\t"\
		"ldr	w1,%[__i1]			\n\t	add	x1 ,x1,x0			\n\t"\
		"ldr	w2,%[__i2]			\n\t	add	x2 ,x2,x0			\n\t"\
		"ldr	w3,%[__i3]			\n\t	add	x3 ,x3,x0			\n\t"\
		/* Compute rcol addresses in lcol here, to mix adds and loads: */\
		"add	x4,x0,x15			\n\t	ldr	x16,%[__c1]			\n\t"\
		"add	x5,x1,x15			\n\t	ldp	q16,q17,[x16,#0x60]	\n\t"/* c4 */\
		"add	x6,x2,x15			\n\t	ldp	q12,q13,[x4]				\n\t"\
		"add	x7,x3,x15			\n\t	ldp	q14,q15,[x5]				\n\t"\
/* c1 */"ldp	q8,q9,[x16]			\n\t	fmul	v18.2d,v12.2d,v16.2d	\n\t"\
		"ldp	q0,q1,[x0]			\n\t	fmul	v19.2d,v13.2d,v16.2d	\n\t"\
		"ldp	q2,q3,[x1]			\n\t	fmla	v18.2d,v13.2d,v17.2d	\n\t"\
		"fmul	v4.2d,v2.2d,v8.2d	\n\t	fmls	v19.2d,v12.2d,v17.2d	\n\t"\
		"fmul	v5.2d,v3.2d,v8.2d	\n\t	ldp	q16,q17,[x16,#0x80]	\n\t"/* c5 */\
		"fmla	v4.2d,v3.2d,v9.2d	\n\t	fmul	v12.2d,v14.2d,v16.2d	\n\t"\
		"fmls	v5.2d,v2.2d,v9.2d	\n\t	fmul	v13.2d,v15.2d,v16.2d	\n\t"\
		"fsub	v2.2d,v0.2d,v4.2d	\n\t	fmla	v12.2d,v15.2d,v17.2d	\n\t"\
		"fsub	v3.2d,v1.2d,v5.2d	\n\t	fmls	v13.2d,v14.2d,v17.2d	\n\t"\
		"fadd	v0.2d,v0.2d,v4.2d	\n\t	fsub	v14.2d,v18.2d,v12.2d	\n\t"\
		"fadd	v1.2d,v1.2d,v5.2d	\n\t	fsub	v15.2d,v19.2d,v13.2d	\n\t"\
		"ldp	q4,q5,[x2]			\n\t	fadd	v12.2d,v18.2d,v12.2d	\n\t"\
		"ldp	q6,q7,[x3]			\n\t	fadd	v13.2d,v19.2d,v13.2d	\n\t"\
/* c2 */"ldp	q8,q9,[x16,#0x20]	\n\t	ldp	q16,q17,[x6]				\n\t"\
		"fmul	v10.2d,v4.2d,v8.2d	\n\t	ldp	q18,q19,[x7]				\n\t"\
		"fmul	v11.2d,v5.2d,v8.2d	\n\t	ldp	q20,q21,[x16,#0xa0]	\n\t"/* c6 */\
		"fmla	v10.2d,v5.2d,v9.2d	\n\t	fmul	v22.2d,v16.2d,v20.2d	\n\t"\
		"fmls	v11.2d,v4.2d,v9.2d	\n\t	fmul	v23.2d,v17.2d,v20.2d	\n\t"\
/* c3 */"ldp	q8,q9,[x16,#0x40]	\n\t	fmla	v22.2d,v17.2d,v21.2d	\n\t"\
		"fmul	v4.2d,v6.2d,v8.2d	\n\t	fmls	v23.2d,v16.2d,v21.2d	\n\t"\
		"fmul	v5.2d,v7.2d,v8.2d	\n\t	ldp	q20,q21,[x16,#0xc0]	\n\t"/* c7 */\
		"fmla	v4.2d,v7.2d,v9.2d	\n\t	fmul	v16.2d,v18.2d,v20.2d	\n\t"\
		"fmls	v5.2d,v6.2d,v9.2d	\n\t	fmul	v17.2d,v19.2d,v20.2d	\n\t"\
		"fsub	v6.2d,v10.2d,v4.2d	\n\t	fmla	v16.2d,v19.2d,v21.2d	\n\t"\
		"fsub	v7.2d,v11.2d,v5.2d	\n\t	fmls	v17.2d,v18.2d,v21.2d	\n\t"\
		"fadd	v4.2d,v10.2d,v4.2d	\n\t	fsub	v18.2d,v22.2d,v16.2d	\n\t"\
		"fadd	v5.2d,v11.2d,v5.2d	\n\t	fsub	v19.2d,v23.2d,v17.2d	\n\t"\
		/* combine to get 2 length-4 */"	fadd	v16.2d,v22.2d,v16.2d	\n\t"\
		"fsub	v8.2d,v0.2d,v4.2d	\n\t	fadd	v17.2d,v23.2d,v17.2d	\n\t"\
		"fsub	v9.2d,v1.2d,v5.2d	\n\t"/* output subtransforms... */\
		"fadd	v0.2d,v0.2d,v4.2d	\n\t	fsub	v20.2d,v12.2d,v16.2d	\n\t"\
		"fadd	v1.2d,v1.2d,v5.2d	\n\t	fsub	v21.2d,v13.2d,v17.2d	\n\t"\
		"stp	q8,q9,[x2]			\n\t	fadd	v12.2d,v12.2d,v16.2d	\n\t"\
		"fadd	v4.2d,v2.2d,v7.2d	\n\t	fadd	v13.2d,v13.2d,v17.2d	\n\t"\
		"fsub	v5.2d,v3.2d,v6.2d	\n\t	stp	q20,q21,[x6]				\n\t"\
		"stp	q0,q1,[x0]			\n\t	fadd	v17.2d,v14.2d,v19.2d	\n\t"\
		"fsub	v2.2d,v2.2d,v7.2d	\n\t	fsub	v16.2d,v15.2d,v18.2d	\n\t"\
		"fadd	v3.2d,v3.2d,v6.2d	\n\t	stp	q12,q13,[x4]				\n\t"\
		"stp	q4,q5,[x1]			\n\t	fsub	v14.2d,v14.2d,v19.2d	\n\t"\
		"stp	q2,q3,[x3]			\n\t	fadd	v15.2d,v15.2d,v18.2d	\n\t"\
		"									stp	q17,q16,[x5]				\n\t"\
		"									stp	q14,q15,[x7]				\n\t"\
	/*...Blocks 2,3: */\
		"add	x16,x16,#0xe0		\n\t"\
		/* x8-11 = in0 + i8-11: */\
		"add	x8 ,x4,x15			\n\t"	/* x12-15 = in0 + i12-15: */\
		"add	x9 ,x5,x15			\n\t	add	x12,x8 ,x15			\n\t"\
		"add	x10,x6,x15			\n\t	add	x13,x9 ,x15			\n\t"\
		"add	x11,x7,x15			\n\t	add	x14,x10,x15			\n\t"\
/* c8 */"ldp	q8,q9,[x16]			\n\t	add	x15,x11,x15			\n\t"\
		"ldp	q4,q5,[x8]			\n\t"\
		"ldp	q2,q3,[x9]			\n\t	ldp	q16,q17,[x16,#0x80]	\n\t"/* c12 */\
		"fmul	v0.2d,v4.2d,v8.2d	\n\t	ldp	q12,q13,[x12]				\n\t"\
		"fmul	v1.2d,v5.2d,v8.2d	\n\t	ldp	q14,q15,[x13]				\n\t"\
		"fmla	v0.2d,v5.2d,v9.2d	\n\t	fmul	v18.2d,v12.2d,v16.2d	\n\t"\
		"fmls	v1.2d,v4.2d,v9.2d	\n\t	fmul	v19.2d,v13.2d,v16.2d	\n\t"\
/* c9 */"ldp	q8,q9,[x16,#0x20]	\n\t	fmla	v18.2d,v13.2d,v17.2d	\n\t"\
		"fmul	v4.2d,v2.2d,v8.2d	\n\t	fmls	v19.2d,v12.2d,v17.2d	\n\t"\
		"fmul	v5.2d,v3.2d,v8.2d	\n\t	ldp	q16,q17,[x16,#0xa0]	\n\t"/* c13 */\
		"fmla	v4.2d,v3.2d,v9.2d	\n\t	fmul	v12.2d,v14.2d,v16.2d	\n\t"\
		"fmls	v5.2d,v2.2d,v9.2d	\n\t	fmul	v13.2d,v15.2d,v16.2d	\n\t"\
		"fsub	v2.2d,v0.2d,v4.2d	\n\t	fmla	v12.2d,v15.2d,v17.2d	\n\t"\
		"fsub	v3.2d,v1.2d,v5.2d	\n\t	fmls	v13.2d,v14.2d,v17.2d	\n\t"\
		"fadd	v0.2d,v0.2d,v4.2d	\n\t	fsub	v14.2d,v18.2d,v12.2d	\n\t"\
		"fadd	v1.2d,v1.2d,v5.2d	\n\t	fsub	v15.2d,v19.2d,v13.2d	\n\t"\
		"ldp	q4,q5,[x10]			\n\t	fadd	v12.2d,v18.2d,v12.2d	\n\t"\
		"ldp	q6,q7,[x11]			\n\t	fadd	v13.2d,v19.2d,v13.2d	\n\t"\
/* c10*/"ldp	q8,q9,[x16,#0x40]	\n\t	ldp	q16,q17,[x14]				\n\t"\
		"fmul	v10.2d,v4.2d,v8.2d	\n\t	ldp	q18,q19,[x15]				\n\t"\
		"fmul	v11.2d,v5.2d,v8.2d	\n\t	ldp	q20,q21,[x16,#0xc0]	\n\t"/* c14 */\
		"fmla	v10.2d,v5.2d,v9.2d	\n\t	fmul	v22.2d,v16.2d,v20.2d	\n\t"\
		"fmls	v11.2d,v4.2d,v9.2d	\n\t	fmul	v23.2d,v17.2d,v20.2d	\n\t"\
/* c11*/"ldp	q8,q9,[x16,#0x60]	\n\t	fmla	v22.2d,v17.2d,v21.2d	\n\t"\
		"fmul	v4.2d,v6.2d,v8.2d	\n\t	fmls	v23.2d,v16.2d,v21.2d	\n\t"\
		"fmul	v5.2d,v7.2d,v8.2d	\n\t	ldp	q20,q21,[x16,#0xe0]	\n\t"/* c15 */\
		"fmla	v4.2d,v7.2d,v9.2d	\n\t	fmul	v16.2d,v18.2d,v20.2d	\n\t"\
		"fmls	v5.2d,v6.2d,v9.2d	\n\t	fmul	v17.2d,v19.2d,v20.2d	\n\t"\
		"fsub	v6.2d,v10.2d,v4.2d	\n\t	fmla	v16.2d,v19.2d,v21.2d	\n\t"\
		"fsub	v7.2d,v11.2d,v5.2d	\n\t	fmls	v17.2d,v18.2d,v21.2d	\n\t"\
		"fadd	v4.2d,v10.2d,v4.2d	\n\t	fsub	v18.2d,v22.2d,v16.2d	\n\t"\
		"fadd	v5.2d,v11.2d,v5.2d	\n\t	fsub	v19.2d,v23.2d,v17.2d	\n\t"\
		/* combine to get 2 length-4 */"	fadd	v16.2d,v22.2d,v16.2d	\n\t"\
		"fsub	v8.2d,v0.2d,v4.2d	\n\t	fadd	v17.2d,v23.2d,v17.2d	\n\t"\
		"fsub	v9.2d,v1.2d,v5.2d	\n\t"/* output subtransforms... */\
		"fadd	v0.2d,v0.2d,v4.2d	\n\t	fsub	v20.2d,v12.2d,v16.2d	\n\t"\
		"fadd	v1.2d,v1.2d,v5.2d	\n\t	fsub	v21.2d,v13.2d,v17.2d	\n\t"\
		"stp	q8,q9,[x10]			\n\t	fadd	v12.2d,v12.2d,v16.2d	\n\t"\
		"fadd	v4.2d,v2.2d,v7.2d	\n\t	fadd	v13.2d,v13.2d,v17.2d	\n\t"\
		"fsub	v5.2d,v3.2d,v6.2d	\n\t	stp	q20,q21,[x14]				\n\t"\
		"stp	q0,q1,[x8]			\n\t	fadd	v17.2d,v14.2d,v19.2d	\n\t"\
		"fsub	v2.2d,v2.2d,v7.2d	\n\t	fsub	v16.2d,v15.2d,v18.2d	\n\t"\
		"fadd	v3.2d,v3.2d,v6.2d	\n\t	stp	q12,q13,[x12]				\n\t"\
		"stp	q4,q5,[x9]			\n\t	fsub	v14.2d,v14.2d,v19.2d	\n\t"\
		"stp	q2,q3,[x11]			\n\t	fadd	v15.2d,v15.2d,v18.2d	\n\t"\
		"									stp	q17,q16,[x13]				\n\t"\
		"									stp	q14,q15,[x15]				\n\t"\
	/*************************************************************************************/\
	/*  And now do four more radix-4 transforms, including the internal twiddle factors: */\
	/*************************************************************************************/\
		"ldr	x16,%[__isrt2]		\n\t	ld1r	{v29.2d},[x16]	\n\t"\
		"ldp	q30,q31,[x16,#0x10]	\n\t"/* cc0,ss0 */\
		"ldr	w16,%[__o4]			\n\t"/* use internal-twiddles-pointer-reg for o1-offset since all twiddles loaded into v-regs */\
		/* Block 0:			Block 2 - this is hacked from the 2nd half of SSE2_RADIX16_DIT_0TWIDDLE,
											hence the effed-up rcol vreg-indexing (e.g. v16,v28): */\
		"ldp	q8 ,q9 ,[x0 ]		\n\t	ldr	x0,%[__out0]		\n\t"\
		"ldp	q0 ,q1 ,[x4 ]		\n\t	add	x4 ,x0 ,x16			\n\t"\
		"ldp	q10,q11,[x8 ]		\n\t	add	x8 ,x4 ,x16			\n\t"\
		"ldp	q4 ,q5 ,[x12]		\n\t	add	x12,x8 ,x16			\n\t"\
		"									ldr	w16,%[__o2]			\n\t"\
/* 20,21 have 0,1 */"ldp	q20,q21,[x2 ]	\n\t	add	x2 ,x0 ,x16		\n\t"\
/* 12,13 have 2,3 */"ldp	q12,q13,[x6 ]	\n\t	add	x6 ,x4 ,x16		\n\t"\
/* 22,23 have 4,5 */"ldp	q22,q23,[x10]	\n\t	add	x10,x8 ,x16		\n\t"\
/* 16,28 have 6,7 */"ldp	q16,q28,[x14]	\n\t	add	x14,x12,x16		\n\t"\
		"									fadd	v18.2d,v23.2d,v22.2d	\n\t"\
		"									fsub	v19.2d,v23.2d,v22.2d	\n\t"\
		"									fadd	v17.2d,v16.2d,v28.2d	\n\t"\
		"									fsub	v16.2d,v16.2d,v28.2d	\n\t"\
		"fsub	v2.2d ,v8.2d ,v0.2d	\n\t	fmul	v18.2d,v29.2d,v18.2d	\n\t"\
		"fsub	v3.2d ,v9.2d ,v1.2d	\n\t	fmul	v19.2d,v29.2d,v19.2d	\n\t"\
		"fadd	v0.2d ,v8.2d ,v0.2d	\n\t	fmul	v16.2d,v29.2d,v16.2d	\n\t"\
		"fadd	v1.2d ,v9.2d ,v1.2d	\n\t	fmul	v17.2d,v29.2d,v17.2d	\n\t"\
		"fsub	v6.2d ,v10.2d,v4.2d	\n\t	fsub	v22.2d,v18.2d,v16.2d	\n\t"\
		"fsub	v7.2d ,v11.2d,v5.2d	\n\t	fadd	v23.2d,v18.2d,v16.2d	\n\t"\
		"fadd	v4.2d ,v10.2d,v4.2d	\n\t	fsub	v16.2d,v19.2d,v17.2d	\n\t"\
		"fadd	v5.2d ,v11.2d,v5.2d	\n\t	fadd	v17.2d,v19.2d,v17.2d	\n\t"\
		"fsub	v8.2d ,v0.2d,v4.2d	\n\t	fsub	v18.2d,v20.2d,v13.2d	\n\t"\
		"fsub	v9.2d ,v1.2d,v5.2d	\n\t	fadd	v19.2d,v20.2d,v13.2d	\n\t"\
		"fadd	v4.2d ,v0.2d,v4.2d	\n\t	fsub	v20.2d,v21.2d,v12.2d	\n\t"\
		"fadd	v5.2d ,v1.2d,v5.2d	\n\t	fadd	v21.2d,v21.2d,v12.2d	\n\t"\
		"fsub	v10.2d,v2.2d,v7.2d	\n\t	fsub	v12.2d,v18.2d,v17.2d	\n\t"\
		"fsub	v11.2d,v3.2d,v6.2d	\n\t	fadd	v13.2d,v18.2d,v17.2d	\n\t"\
		"fadd	v7.2d ,v2.2d,v7.2d	\n\t	fsub	v18.2d,v19.2d,v22.2d	\n\t"\
		"fadd	v6.2d ,v3.2d,v6.2d	\n\t	fadd	v19.2d,v19.2d,v22.2d	\n\t"\
		"stp	q4 ,q5 ,[x0 ]		\n\t	fsub	v14.2d,v20.2d,v16.2d	\n\t"\
		"stp	q7 ,q11,[x4 ]		\n\t	fadd	v15.2d,v20.2d,v16.2d	\n\t"\
		"stp	q8 ,q9 ,[x8 ]		\n\t	fsub	v20.2d,v21.2d,v23.2d	\n\t"\
		"stp	q10,q6 ,[x12]		\n\t	fadd	v21.2d,v21.2d,v23.2d	\n\t"\
		/* Block 1: */\
		"ldr	w16,%[__o1]			\n\t"\
		"ldp	q10,q11,[x1 ]	\n\t	add	x1 ,x0 ,x16	\n\t	stp	q19,q15,[x2 ]		\n\t"\
		"ldp	q2 ,q3 ,[x5 ]	\n\t	add	x5 ,x4 ,x16	\n\t	stp	q13,q20,[x6 ]		\n\t"\
		"ldp	q6 ,q7 ,[x9 ]	\n\t	add	x9 ,x8 ,x16	\n\t	stp	q18,q14,[x10]		\n\t"\
		"ldp	q8 ,q9 ,[x13]	\n\t	add	x13,x12,x16	\n\t	stp	q12,q21,[x14]		\n\t"\
											/* Block 3: */\
		"fmul	v0.2d,v8.2d,v31.2d	\n\t	ldp	q22,q23,[x3 ]	\n\t	add	x3 ,x2 ,x16	\n\t"\
		"fmul	v1.2d,v9.2d,v31.2d	\n\t	ldp	q14,q15,[x7 ]	\n\t	add	x7 ,x6 ,x16	\n\t"\
		"fmla	v0.2d,v9.2d,v30.2d	\n\t	ldp	q18,q19,[x11]	\n\t	add	x11,x10,x16	\n\t"\
		"fmls	v1.2d,v8.2d,v30.2d	\n\t	ldp	q20,q21,[x15]	\n\t	add	x15,x14,x16	\n\t"\
		"fmul	v8.2d,v6.2d,v30.2d	\n\t	fmul	v12.2d,v20.2d,v30.2d	\n\t"\
		"fmul	v9.2d,v7.2d,v30.2d	\n\t	fmul	v13.2d,v21.2d,v30.2d	\n\t"\
		"fmla	v8.2d,v7.2d,v31.2d	\n\t	fmla	v12.2d,v21.2d,v31.2d	\n\t"\
		"fmls	v9.2d,v6.2d,v31.2d	\n\t	fmls	v13.2d,v20.2d,v31.2d	\n\t"\
		"fadd	v4.2d,v8.2d,v0.2d	\n\t	fmul	v20.2d,v18.2d,v31.2d	\n\t"\
		"fadd	v5.2d,v9.2d,v1.2d	\n\t	fmul	v21.2d,v19.2d,v31.2d	\n\t"\
		"fsub	v6.2d,v8.2d,v0.2d	\n\t	fmla	v20.2d,v19.2d,v30.2d	\n\t"\
		"fsub	v7.2d,v9.2d,v1.2d	\n\t	fmls	v21.2d,v18.2d,v30.2d	\n\t"\
		"fadd	v8.2d,v2.2d,v3.2d	\n\t	fadd	v16.2d,v20.2d,v12.2d	\n\t"\
		"fsub	v9.2d,v3.2d,v2.2d	\n\t	fadd	v17.2d,v21.2d,v13.2d	\n\t"\
		"fmul	v8.2d,v8.2d,v29.2d	\n\t	fsub	v18.2d,v20.2d,v12.2d	\n\t"\
		"fmul	v9.2d,v9.2d,v29.2d	\n\t	fsub	v19.2d,v21.2d,v13.2d	\n\t"\
		"fsub	v0.2d,v10.2d,v8.2d	\n\t	fsub	v20.2d,v14.2d,v15.2d	\n\t"\
		"fsub	v1.2d,v11.2d,v9.2d	\n\t	fadd	v21.2d,v15.2d,v14.2d	\n\t"\
		"fadd	v2.2d,v10.2d,v8.2d	\n\t	fmul	v20.2d,v20.2d,v29.2d	\n\t"\
		"fadd	v3.2d,v11.2d,v9.2d	\n\t	fmul	v21.2d,v21.2d,v29.2d	\n\t"\
		"fadd	v8.2d,v2.2d,v4.2d	\n\t	fsub	v12.2d,v22.2d,v20.2d	\n\t"\
		"fadd	v9.2d,v3.2d,v5.2d	\n\t	fsub	v13.2d,v23.2d,v21.2d	\n\t"\
		"fsub	v2.2d,v2.2d,v4.2d	\n\t	fadd	v14.2d,v22.2d,v20.2d	\n\t"\
		"fsub	v3.2d,v3.2d,v5.2d	\n\t	fadd	v15.2d,v23.2d,v21.2d	\n\t"\
		"fadd	v4.2d,v0.2d,v7.2d	\n\t	fadd	v20.2d,v12.2d,v18.2d	\n\t"\
		"fadd	v5.2d,v1.2d,v6.2d	\n\t	fadd	v21.2d,v13.2d,v19.2d	\n\t"\
		"fsub	v0.2d,v0.2d,v7.2d	\n\t	fsub	v12.2d,v12.2d,v18.2d	\n\t"\
		"fsub	v1.2d,v1.2d,v6.2d	\n\t	fsub	v13.2d,v13.2d,v19.2d	\n\t"\
		"stp	q8 ,q9 ,[x1 ]		\n\t	fadd	v18.2d,v14.2d,v17.2d	\n\t"\
		"stp	q2 ,q3 ,[x9 ]		\n\t	fadd	v19.2d,v15.2d,v16.2d	\n\t"\
		"stp	q4 ,q1 ,[x5 ]		\n\t	fsub	v14.2d,v14.2d,v17.2d	\n\t"\
		"stp	q0 ,q5 ,[x13]		\n\t	fsub	v15.2d,v15.2d,v16.2d	\n\t"\
		"									stp	q20,q21,[x3 ]		\n\t"\
		"									stp	q12,q13,[x11]		\n\t"\
		"									stp	q18,q15,[x7 ]		\n\t"\
		"									stp	q14,q19,[x15]		\n\t"\
		:					/* outputs: none */\
		: [__in0] "m" (Xin0)	/* All inputs from memory addresses here */\
		 ,[__i1] "m" (Xi1)\
		 ,[__i2] "m" (Xi2)\
		 ,[__i3] "m" (Xi3)\
		 ,[__i4] "m" (Xi4)\
		 ,[__out0] "m" (Xout0)\
		 ,[__o1] "m" (Xo1)\
		 ,[__o2] "m" (Xo2)\
		 ,[__o3] "m" (Xo3)\
		 ,[__o4] "m" (Xo4)\
		 ,[__isrt2] "m" (Xisrt2)\
		 ,[__c1] "m" (Xc1)\
		: "cc","memory","x0","x1","x2","x3","x4","x5","x6","x7","x8","x9","x10","x11","x12","x13","x14","x15","x16",\
			"v0","v1","v2","v3","v4","v5","v6","v7","v8","v9","v10","v11",\
			"v12","v13","v14","v15","v16","v17","v18","v19","v20","v21","v22","v23", "v28","v29","v30","v31"	/* Clobbered registers */\
	);\
	}

	// DIF version of above shares same sincos layout & data:
	#define SSE2_RADIX16_DIF_TWIDDLE_OOP(Xin0,Xi1,Xi4, Xout0,Xoff, Xisrt2,Xc1)\
	{\
	__asm__ volatile (\
	/*...Blocks 0,1: Do in-place, i.e. outputs into __in0 + [0,1,2,3]*istride: */\
		"ldr	x0,%[__in0]		\n\t	ldr	w3,%[__i1]	\n\t	ldr	w15,%[__i4]	\n\t"\
		"add	x1 ,x3,x0		\n\t"/* x1 = __in0 +   istride */\
		"add	x2 ,x3,x1		\n\t"/* x2 = __in0 + 2*istride */\
		"add	x3 ,x3,x2		\n\t"/* x3 = __in0 + 3*istride */\
		/* Compute rcol addresses in lcol here, to mix adds and loads: */\
		"add	x4,x0,x15			\n\t	ldr	x16,%[__c1]			\n\t"\
		"add	x5,x1,x15			\n\t	ldp	q16,q17,[x16,#0x60]	\n\t"/* c4 */\
		"add	x6,x2,x15			\n\t	ldp	q12,q13,[x4]				\n\t"\
		"add	x7,x3,x15			\n\t	ldp	q14,q15,[x5]				\n\t"\
/* c1 */"ldp	q8,q9,[x16]			\n\t	fmul	v18.2d,v12.2d,v16.2d	\n\t"\
		"ldp	q0,q1,[x0]			\n\t	fmul	v19.2d,v13.2d,v16.2d	\n\t"\
		"ldp	q2,q3,[x1]			\n\t	fmls	v18.2d,v13.2d,v17.2d	\n\t"\
		"fmul	v4.2d,v2.2d,v8.2d	\n\t	fmla	v19.2d,v12.2d,v17.2d	\n\t"\
		"fmul	v5.2d,v3.2d,v8.2d	\n\t	ldp	q16,q17,[x16,#0x80]	\n\t"/* c5 */\
		"fmls	v4.2d,v3.2d,v9.2d	\n\t	fmul	v12.2d,v14.2d,v16.2d	\n\t"\
		"fmla	v5.2d,v2.2d,v9.2d	\n\t	fmul	v13.2d,v15.2d,v16.2d	\n\t"\
		"fsub	v2.2d,v0.2d,v4.2d	\n\t	fmls	v12.2d,v15.2d,v17.2d	\n\t"\
		"fsub	v3.2d,v1.2d,v5.2d	\n\t	fmla	v13.2d,v14.2d,v17.2d	\n\t"\
		"fadd	v0.2d,v0.2d,v4.2d	\n\t	fsub	v14.2d,v18.2d,v12.2d	\n\t"\
		"fadd	v1.2d,v1.2d,v5.2d	\n\t	fsub	v15.2d,v19.2d,v13.2d	\n\t"\
		"ldp	q4,q5,[x2]			\n\t	fadd	v12.2d,v18.2d,v12.2d	\n\t"\
		"ldp	q6,q7,[x3]			\n\t	fadd	v13.2d,v19.2d,v13.2d	\n\t"\
/* c2 */"ldp	q8,q9,[x16,#0x20]	\n\t	ldp	q16,q17,[x6]				\n\t"\
		"fmul	v10.2d,v4.2d,v8.2d	\n\t	ldp	q18,q19,[x7]				\n\t"\
		"fmul	v11.2d,v5.2d,v8.2d	\n\t	ldp	q20,q21,[x16,#0xa0]	\n\t"/* c6 */\
		"fmls	v10.2d,v5.2d,v9.2d	\n\t	fmul	v22.2d,v16.2d,v20.2d	\n\t"\
		"fmla	v11.2d,v4.2d,v9.2d	\n\t	fmul	v23.2d,v17.2d,v20.2d	\n\t"\
/* c3 */"ldp	q8,q9,[x16,#0x40]	\n\t	fmls	v22.2d,v17.2d,v21.2d	\n\t"\
		"fmul	v4.2d,v6.2d,v8.2d	\n\t	fmla	v23.2d,v16.2d,v21.2d	\n\t"\
		"fmul	v5.2d,v7.2d,v8.2d	\n\t	ldp	q20,q21,[x16,#0xc0]	\n\t"/* c7 */\
		"fmls	v4.2d,v7.2d,v9.2d	\n\t	fmul	v16.2d,v18.2d,v20.2d	\n\t"\
		"fmla	v5.2d,v6.2d,v9.2d	\n\t	fmul	v17.2d,v19.2d,v20.2d	\n\t"\
		"fsub	v6.2d,v10.2d,v4.2d	\n\t	fmls	v16.2d,v19.2d,v21.2d	\n\t"\
		"fsub	v7.2d,v11.2d,v5.2d	\n\t	fmla	v17.2d,v18.2d,v21.2d	\n\t"\
		"fadd	v4.2d,v10.2d,v4.2d	\n\t	fsub	v18.2d,v22.2d,v16.2d	\n\t"\
		"fadd	v5.2d,v11.2d,v5.2d	\n\t	fsub	v19.2d,v23.2d,v17.2d	\n\t"\
		/* combine to get 2 length-4 */"	fadd	v16.2d,v22.2d,v16.2d	\n\t"\
		"fsub	v8.2d,v0.2d,v4.2d	\n\t	fadd	v17.2d,v23.2d,v17.2d	\n\t"\
		"fsub	v9.2d,v1.2d,v5.2d	\n\t"/* output subtransforms... */\
		"fadd	v0.2d,v0.2d,v4.2d	\n\t	fsub	v20.2d,v12.2d,v16.2d	\n\t"\
		"fadd	v1.2d,v1.2d,v5.2d	\n\t	fsub	v21.2d,v13.2d,v17.2d	\n\t"\
		"stp	q8,q9,[x1]			\n\t	fadd	v12.2d,v12.2d,v16.2d	\n\t"\
		"fadd	v4.2d,v2.2d,v7.2d	\n\t	fadd	v13.2d,v13.2d,v17.2d	\n\t"\
		"fsub	v5.2d,v3.2d,v6.2d	\n\t	stp	q20,q21,[x5]				\n\t"\
		"stp	q0,q1,[x0]			\n\t	fadd	v17.2d,v14.2d,v19.2d	\n\t"\
		"fsub	v2.2d,v2.2d,v7.2d	\n\t	fsub	v16.2d,v15.2d,v18.2d	\n\t"\
		"fadd	v3.2d,v3.2d,v6.2d	\n\t	stp	q12,q13,[x4]				\n\t"\
		"stp	q4,q5,[x3]			\n\t	fsub	v14.2d,v14.2d,v19.2d	\n\t"\
		"stp	q2,q3,[x2]			\n\t	fadd	v15.2d,v15.2d,v18.2d	\n\t"\
		"									stp	q17,q16,[x7]				\n\t"\
		"									stp	q14,q15,[x6]				\n\t"\
	/*...Blocks 2,3: */\
		"add	x16,x16,#0xe0		\n\t"\
		/* x8-11 = in0 + i8-11: */\
		"add	x0 ,x4,x15			\n\t"	/* x12-15 = in0 + i12-15: */\
		"add	x1 ,x5,x15			\n\t	add	x4 ,x0 ,x15			\n\t"\
		"add	x2 ,x6,x15			\n\t	add	x5 ,x1 ,x15			\n\t"\
		"add	x3 ,x7,x15			\n\t	add	x6 ,x2 ,x15			\n\t"\
/* c8 */"ldp	q8,q9,[x16]			\n\t	add	x7 ,x3 ,x15			\n\t"\
		"ldp	q4,q5,[x0]			\n\t"\
		"ldp	q2,q3,[x1]			\n\t	ldp	q16,q17,[x16,#0x80]	\n\t"/* c12 */\
		"fmul	v0.2d,v4.2d,v8.2d	\n\t	ldp	q12,q13,[x4]				\n\t"\
		"fmul	v1.2d,v5.2d,v8.2d	\n\t	ldp	q14,q15,[x5]				\n\t"\
		"fmls	v0.2d,v5.2d,v9.2d	\n\t	fmul	v18.2d,v12.2d,v16.2d	\n\t"\
		"fmla	v1.2d,v4.2d,v9.2d	\n\t	fmul	v19.2d,v13.2d,v16.2d	\n\t"\
/* c9 */"ldp	q8,q9,[x16,#0x20]	\n\t	fmls	v18.2d,v13.2d,v17.2d	\n\t"\
		"fmul	v4.2d,v2.2d,v8.2d	\n\t	fmla	v19.2d,v12.2d,v17.2d	\n\t"\
		"fmul	v5.2d,v3.2d,v8.2d	\n\t	ldp	q16,q17,[x16,#0xa0]	\n\t"/* c13 */\
		"fmls	v4.2d,v3.2d,v9.2d	\n\t	fmul	v12.2d,v14.2d,v16.2d	\n\t"\
		"fmla	v5.2d,v2.2d,v9.2d	\n\t	fmul	v13.2d,v15.2d,v16.2d	\n\t"\
		"fsub	v2.2d,v0.2d,v4.2d	\n\t	fmls	v12.2d,v15.2d,v17.2d	\n\t"\
		"fsub	v3.2d,v1.2d,v5.2d	\n\t	fmla	v13.2d,v14.2d,v17.2d	\n\t"\
		"fadd	v0.2d,v0.2d,v4.2d	\n\t	fsub	v14.2d,v18.2d,v12.2d	\n\t"\
		"fadd	v1.2d,v1.2d,v5.2d	\n\t	fsub	v15.2d,v19.2d,v13.2d	\n\t"\
		"ldp	q4,q5,[x2]			\n\t	fadd	v12.2d,v18.2d,v12.2d	\n\t"\
		"ldp	q6,q7,[x3]			\n\t	fadd	v13.2d,v19.2d,v13.2d	\n\t"\
/* c10*/"ldp	q8,q9,[x16,#0x40]	\n\t	ldp	q16,q17,[x6]				\n\t"\
		"fmul	v10.2d,v4.2d,v8.2d	\n\t	ldp	q18,q19,[x7]				\n\t"\
		"fmul	v11.2d,v5.2d,v8.2d	\n\t	ldp	q20,q21,[x16,#0xc0]	\n\t"/* c14 */\
		"fmls	v10.2d,v5.2d,v9.2d	\n\t	fmul	v22.2d,v16.2d,v20.2d	\n\t"\
		"fmla	v11.2d,v4.2d,v9.2d	\n\t	fmul	v23.2d,v17.2d,v20.2d	\n\t"\
/* c11*/"ldp	q8,q9,[x16,#0x60]	\n\t	fmls	v22.2d,v17.2d,v21.2d	\n\t"\
		"fmul	v4.2d,v6.2d,v8.2d	\n\t	fmla	v23.2d,v16.2d,v21.2d	\n\t"\
		"fmul	v5.2d,v7.2d,v8.2d	\n\t	ldp	q20,q21,[x16,#0xe0]	\n\t"/* c15 */\
		"fmls	v4.2d,v7.2d,v9.2d	\n\t	fmul	v16.2d,v18.2d,v20.2d	\n\t"\
		"fmla	v5.2d,v6.2d,v9.2d	\n\t	fmul	v17.2d,v19.2d,v20.2d	\n\t"\
		"fsub	v6.2d,v10.2d,v4.2d	\n\t	fmls	v16.2d,v19.2d,v21.2d	\n\t"\
		"fsub	v7.2d,v11.2d,v5.2d	\n\t	fmla	v17.2d,v18.2d,v21.2d	\n\t"\
		"fadd	v4.2d,v10.2d,v4.2d	\n\t	fsub	v18.2d,v22.2d,v16.2d	\n\t"\
		"fadd	v5.2d,v11.2d,v5.2d	\n\t	fsub	v19.2d,v23.2d,v17.2d	\n\t"\
		/* combine to get 2 length-4 */"	fadd	v16.2d,v22.2d,v16.2d	\n\t"\
		"fsub	v8.2d,v0.2d,v4.2d	\n\t	fadd	v17.2d,v23.2d,v17.2d	\n\t"\
		"fsub	v9.2d,v1.2d,v5.2d	\n\t"/* output subtransforms... */\
		"fadd	v0.2d,v0.2d,v4.2d	\n\t	fsub	v20.2d,v12.2d,v16.2d	\n\t"\
		"fadd	v1.2d,v1.2d,v5.2d	\n\t	fsub	v21.2d,v13.2d,v17.2d	\n\t"\
		"stp	q8,q9,[x1]			\n\t	fadd	v12.2d,v12.2d,v16.2d	\n\t"\
		"fadd	v4.2d,v2.2d,v7.2d	\n\t	fadd	v13.2d,v13.2d,v17.2d	\n\t"\
		"fsub	v5.2d,v3.2d,v6.2d	\n\t	stp	q20,q21,[x5]				\n\t"\
		"stp	q0,q1,[x0]			\n\t	fadd	v17.2d,v14.2d,v19.2d	\n\t"\
		"fsub	v2.2d,v2.2d,v7.2d	\n\t	fsub	v16.2d,v15.2d,v18.2d	\n\t"\
		"fadd	v3.2d,v3.2d,v6.2d	\n\t	stp	q12,q13,[x4]				\n\t"\
		"stp	q4,q5,[x3]			\n\t	fsub	v14.2d,v14.2d,v19.2d	\n\t"\
		"stp	q2,q3,[x2]			\n\t	fadd	v15.2d,v15.2d,v18.2d	\n\t"\
		"									stp	q17,q16,[x7]				\n\t"\
		"									stp	q14,q15,[x6]				\n\t"\
	/*************************************************************************************/\
	/*  And now do four more radix-4 transforms, including the internal twiddle factors: */\
	/*************************************************************************************/\
		"ldr	x16,%[__isrt2]		\n\t	ld1r	{v29.2d},[x16]	\n\t"\
		"ldp	q30,q31,[x16,#0x10]	\n\t"/* cc0,ss0 - Done with x16-for-twiddles, re-use below for O-addresses */\
/* Recompute __in0 + [0,4,8,12]*istride into x0-3, __in0 + [1,5,9,13]*istride into x4-7. 4*istride still in x15.
Use x0-7 for I-addresses, x8-15 for O-addresses - by the time we need x15 for outf, are done with I-addressing: */\
		"ldr	x0,%[__in0]		\n\t"/* __in0 + [0,4,8,12]*istride */\
		"add	x1,x15,x0		\n\t"\
		"add	x2,x15,x1		\n\t"\
		"add	x3,x15,x2		\n\t	ldr	w15,%[__i1]	\n\t"/* Done with __i4, re-use x15 for __i1 */\
		"add	x4,x15,x0		\n\t"/* __in0 + [1,5,9,13]*istride */\
		"add	x5,x15,x1		\n\t"\
		"add	x6,x15,x2		\n\t"\
		"add	x7,x15,x3		\n\t	add x15,x15,x15	\n\t"/* Done with __i1, re-use x15 for __i2 */\
	/* Load output base-address into x16 and offset-array pointer into x17: */\
		"ldr	x16,%[__out0]	\n\t	ldr	x17,%[__off]		\n\t"\
		"ldp	w8 ,w9 ,[x17]	\n\t	ldp	w10,w11,[x17,#0x08]	\n\t"/* off0-3 */\
		/* Block 0:			Block 2 - this is hacked from the 2nd half of SSE2_RADIX16_DIF_0TWIDDLE_B: */\
		"ldp	q8 ,q9 ,[x0]	\n\t	add	x8 ,x16,x8 ,lsl #3	\n\t"/* out0-3 = (double *)out0 + o0-3 */\
		"ldp	q2 ,q3 ,[x1]	\n\t	add	x9 ,x16,x9 ,lsl #3	\n\t"\
		"ldp	q10,q11,[x2]	\n\t	add	x10,x16,x10,lsl #3	\n\t"\
		"ldp	q6 ,q7 ,[x3]	\n\t	add	x11,x16,x11,lsl #3	\n\t"\
		"ldp	w12,w13,[x17,#0x10]		\n\t	ldp	w14,w15,[x17,#0x18]		\n\t"/* off4-7 */\
/* 22,23 have 0,1 */"ldp	q22,q23,[x4]	\n\t	add	x12,x16,x12,lsl #3	\n\t"/* out4-7 */\
/* 24,25 have 2,3 */"ldp	q24,q25,[x5]	\n\t	add	x13,x16,x13,lsl #3	\n\t"\
/* 12,13 have 4,5 */"ldp	q12,q13,[x6]	\n\t	add	x14,x16,x14,lsl #3	\n\t"\
/* 14,15 have 6,7 */"ldp	q14,q15,[x7]	\n\t	add	x15,x16,x15,lsl #3	\n\t"\
		"									fsub	v16.2d,v12.2d,v13.2d	\n\t"\
		"									fadd	v17.2d,v12.2d,v13.2d	\n\t"\
		"									fadd	v18.2d,v15.2d,v14.2d	\n\t"\
		"									fsub	v19.2d,v15.2d,v14.2d	\n\t"\
		"fsub	v0.2d ,v8.2d ,v2.2d	\n\t	fmul	v20.2d,v29.2d,v16.2d	\n\t"\
		"fsub	v1.2d ,v9.2d ,v3.2d	\n\t	fmul	v21.2d,v29.2d,v17.2d	\n\t"\
		"fadd	v2.2d ,v8.2d ,v2.2d	\n\t	fmul	v18.2d,v29.2d,v18.2d	\n\t"\
		"fadd	v3.2d ,v9.2d ,v3.2d	\n\t	fmul	v19.2d,v29.2d,v19.2d	\n\t"\
		"fsub	v4.2d ,v10.2d,v6.2d	\n\t	fsub	v12.2d,v22.2d,v25.2d	\n\t"\
		"fsub	v5.2d ,v11.2d,v7.2d	\n\t	fsub	v13.2d,v23.2d,v24.2d	\n\t"\
		"fadd	v6.2d ,v10.2d,v6.2d	\n\t	fadd	v14.2d,v23.2d,v24.2d	\n\t"\
		"fadd	v7.2d ,v11.2d,v7.2d	\n\t	fadd	v15.2d,v22.2d,v25.2d	\n\t"\
		"fsub	v8.2d ,v2.2d,v6.2d	\n\t	fsub	v16.2d,v20.2d,v18.2d	\n\t"\
		"fsub	v9.2d ,v3.2d,v7.2d	\n\t	fsub	v17.2d,v21.2d,v19.2d	\n\t"\
		"fadd	v6.2d ,v2.2d,v6.2d	\n\t	fadd	v18.2d,v20.2d,v18.2d	\n\t"\
		"fadd	v7.2d ,v3.2d,v7.2d	\n\t	fadd	v19.2d,v21.2d,v19.2d	\n\t"\
		"fsub	v10.2d,v0.2d,v5.2d	\n\t	fsub	v20.2d,v12.2d,v16.2d	\n\t"\
		"fsub	v11.2d,v1.2d,v4.2d	\n\t	fadd	v16.2d,v12.2d,v16.2d	\n\t"\
		"fadd	v5.2d ,v0.2d,v5.2d	\n\t	fsub	v21.2d,v14.2d,v17.2d	\n\t"\
		"fadd	v4.2d ,v1.2d,v4.2d	\n\t	fadd	v17.2d,v14.2d,v17.2d	\n\t"\
		"stp	q6 ,q7 ,[x8 ]		\n\t	fsub	v22.2d,v13.2d,v18.2d	\n\t"\
		"stp	q8 ,q9 ,[x9 ]		\n\t	fadd	v18.2d,v13.2d,v18.2d	\n\t"\
		"stp	q10,q4 ,[x10]		\n\t	fsub	v23.2d,v15.2d,v19.2d	\n\t"\
		"stp	q5 ,q11,[x11]		\n\t	fadd	v19.2d,v15.2d,v19.2d	\n\t"\
		/* Block 1:							Block 3: */\
		"ldr	w16,%[__i1]	\n\t	add x16,x16,x16	\n\t"/* Need x16 for __i2 here */\
		"add	x0,x16,x0		\n\t"/* __in0 + [2,6,10,14]*istride */\
		"add	x1,x16,x1		\n\t"\
		"add	x2,x16,x2		\n\t"\
		"add	x3,x16,x3		\n\t"\
		"add	x4,x16,x4		\n\t"/* __in0 + [3,7,11,16]*istride */\
		"add	x5,x16,x5		\n\t"\
		"add	x6,x16,x6		\n\t"\
		"add	x7,x16,x7		\n\t"/* Done with x16 for I-addressing */\
	/* Reload output base-address into x16: */\
		"ldr	x16,%[__out0]		\n\t"\
		"ldp	w8 ,w9 ,[x17,#0x20]	\n\t	ldp	w10,w11,[x17,#0x28]	\n\t"/* off8-b */\
		"ldp	q0 ,q1 ,[x0]	\n\t	add	x8 ,x16,x8 ,lsl #3	\n\t	stp	q16,q17,[x12]	\n\t"\
		"ldp	q2 ,q3 ,[x1]	\n\t	add	x9 ,x16,x9 ,lsl #3	\n\t	stp	q20,q21,[x13]	\n\t"\
		"ldp	q4 ,q5 ,[x2]	\n\t	add	x10,x16,x10,lsl #3	\n\t	stp	q23,q18,[x14]	\n\t"\
		"ldp	q10,q11,[x3]	\n\t	add	x11,x16,x11,lsl #3	\n\t	stp	q19,q22,[x15]	\n\t"\
											/* Block 3: */				/*     outc-f:     */\
		"ldp	w12,w13,[x17,#0x30]		\n\t	ldp	w14,w15,[x17,#0x38]		\n\t"/* offc-f */\
		"fmul	v8.2d,v4.2d,v30.2d	\n\t	ldp	q12,q13,[x4]	\n\t	add	x12,x16,x12,lsl #3	\n\t"\
		"fmul	v9.2d,v5.2d,v30.2d	\n\t	ldp	q14,q15,[x5]	\n\t	add	x13,x16,x13,lsl #3	\n\t"\
		"fmls	v8.2d,v5.2d,v31.2d	\n\t	ldp	q16,q17,[x6]	\n\t	add	x14,x16,x14,lsl #3	\n\t"\
		"fmla	v9.2d,v4.2d,v31.2d	\n\t	ldp	q18,q19,[x7]	\n\t	add	x15,x16,x15,lsl #3	\n\t"\
		"fmul	v6.2d,v10.2d,v31.2d	\n\t	fmul	v20.2d,v16.2d,v31.2d \n\t"\
		"fmul	v7.2d,v11.2d,v31.2d	\n\t	fmul	v21.2d,v17.2d,v31.2d \n\t"\
		"fmls	v6.2d,v11.2d,v30.2d	\n\t	fmls	v20.2d,v17.2d,v30.2d \n\t"\
		"fmla	v7.2d,v10.2d,v30.2d	\n\t	fmla	v21.2d,v16.2d,v30.2d \n\t"\
		"fsub	v4.2d,v8.2d,v6.2d	\n\t	fmul	v22.2d,v18.2d,v30.2d \n\t"\
		"fsub	v5.2d,v9.2d,v7.2d	\n\t	fmul	v23.2d,v19.2d,v30.2d \n\t"\
		"fadd	v6.2d,v8.2d,v6.2d	\n\t	fmls	v22.2d,v19.2d,v31.2d \n\t"\
		"fadd	v7.2d,v9.2d,v7.2d	\n\t	fmla	v23.2d,v18.2d,v31.2d \n\t"\
		"fsub	v8.2d,v2.2d,v3.2d	\n\t	fsub	v16.2d,v20.2d,v22.2d \n\t"\
		"fadd	v9.2d,v2.2d,v3.2d	\n\t	fsub	v17.2d,v21.2d,v23.2d \n\t"\
		"fmul	v8.2d,v8.2d,v29.2d	\n\t	fadd	v18.2d,v20.2d,v22.2d \n\t"\
		"fmul	v9.2d,v9.2d,v29.2d	\n\t	fadd	v19.2d,v21.2d,v23.2d \n\t"\
		"fadd	v2.2d,v0.2d,v8.2d	\n\t	fadd	v20.2d,v15.2d,v14.2d \n\t"\
		"fadd	v3.2d,v1.2d,v9.2d	\n\t	fsub	v21.2d,v15.2d,v14.2d \n\t"\
		"fsub	v0.2d,v0.2d,v8.2d	\n\t	fmul	v20.2d,v20.2d,v29.2d \n\t"\
		"fsub	v1.2d,v1.2d,v9.2d	\n\t	fmul	v21.2d,v21.2d,v29.2d \n\t"\
		"fadd	v8.2d,v2.2d,v6.2d	\n\t	fadd	v14.2d,v12.2d,v20.2d \n\t"\
		"fadd	v9.2d,v3.2d,v7.2d	\n\t	fadd	v15.2d,v13.2d,v21.2d \n\t"\
		"fsub	v2.2d,v2.2d,v6.2d	\n\t	fsub	v12.2d,v12.2d,v20.2d \n\t"\
		"fsub	v3.2d,v3.2d,v7.2d	\n\t	fsub	v13.2d,v13.2d,v21.2d \n\t"\
		"fadd	v10.2d,v0.2d,v5.2d	\n\t	fadd	v20.2d,v12.2d,v16.2d \n\t"\
		"fadd	v11.2d,v1.2d,v4.2d	\n\t	fadd	v21.2d,v13.2d,v17.2d \n\t"\
		"fsub	v0.2d,v0.2d,v5.2d	\n\t	fsub	v12.2d,v12.2d,v16.2d \n\t"\
		"fsub	v1.2d,v1.2d,v4.2d	\n\t	fsub	v13.2d,v13.2d,v17.2d \n\t"\
		"stp	q8 ,q9 ,[x8 ]		\n\t	fadd	v22.2d,v14.2d,v19.2d \n\t"\
		"stp	q2 ,q3 ,[x9 ]		\n\t	fadd	v23.2d,v15.2d,v18.2d \n\t"\
		"stp	q0 ,q11,[x10]		\n\t	fsub	v14.2d,v14.2d,v19.2d \n\t"\
		"stp	q10,q1 ,[x11]		\n\t	fsub	v15.2d,v15.2d,v18.2d \n\t"\
		"									stp	q20,q21,[x12]			\n\t"\
		"									stp	q12,q13,[x13]			\n\t"\
		"									stp	q14,q23,[x14]			\n\t"\
		"									stp	q22,q15,[x15]			\n\t"\
		:					/* outputs: none */\
		: [__in0] "m" (Xin0)	/* All inputs from memory addresses here */\
		 ,[__i1] "m" (Xi1)\
		 ,[__i4] "m" (Xi4)\
		 ,[__out0] "m" (Xout0)\
		 ,[__off] "m" (Xoff)\
		 ,[__isrt2] "m" (Xisrt2)\
		 ,[__c1] "m" (Xc1)\
		: "cc","memory","x0","x1","x2","x3","x4","x5","x6","x7","x8","x9","x10","x11","x12","x13","x14","x15","x16","x17",\
			"v0","v1","v2","v3","v4","v5","v6","v7","v8","v9","v10","v11",\
			"v12","v13","v14","v15","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25", "v29","v30","v31"	/* Clobbered registers */\
	);\
	}

#elif defined(USE_AVX512)	// Use AVX2/FMA3 macros as starting point for these

	// Cost: 12 DP-math, 17 vector MOV for each of the two side-by-side 3-DFTs
	/* Twin length-3 complex DFTs, executed side-by-side in the two instruction columns:
	the left column maps inputs [__i0,__i1,__i2] -> outputs [__o0,__o1,__o2] in zmm0-7;
	the right column independently maps [__j0,__j1,__j2] -> [__u0,__u1,__u2] in zmm8-15.
	The two trig vectors at (__cc0) and 0x40(__cc0) are loaded once (zmm6,7) and shared
	by both columns (presumably c3m1 = cos(2*pi/3)-1 and s3 = sin(2*pi/3) - confirm vs.
	callers). Outputs may coincide with inputs; all pointers must be 64-byte aligned
	for the vmovaps accesses. */
	#define SSE2_RADIX_03_DFT_X2(Xcc0, Xi0,Xi1,Xi2, Xo0,Xo1,Xo2, Xj0,Xj1,Xj2, Xu0,Xu1,Xu2)\
	{\
	__asm__ volatile (\
		"movq	%[__i0],%%rax	\n\t	movq	%[__j0],%%r10	\n\t"\
		"movq	%[__i1],%%rbx	\n\t	movq	%[__j1],%%r11	\n\t"\
		"movq	%[__i2],%%rcx	\n\t	movq	%[__j2],%%r12	\n\t"\
		"movq	%[__cc0],%%rdx	\n\t							\n\t"\
		"vmovaps	    (%%rbx),%%zmm2		\n\t	vmovaps	    (%%r11),%%zmm10	\n\t"\
		"vmovaps	0x40(%%rbx),%%zmm3		\n\t	vmovaps	0x40(%%r11),%%zmm11	\n\t"\
		"vmovaps	    (%%rax),%%zmm0		\n\t	vmovaps	    (%%r10),%%zmm8 	\n\t"\
		"vmovaps	0x40(%%rax),%%zmm1		\n\t	vmovaps	0x40(%%r10),%%zmm9 	\n\t"\
		"vmovaps	    (%%rcx),%%zmm6		\n\t	vmovaps	    (%%r12),%%zmm14	\n\t"\
		"vmovaps	0x40(%%rcx),%%zmm7		\n\t	vmovaps	0x40(%%r12),%%zmm15	\n\t"\
		"vmovaps	%%zmm2,%%zmm4			\n\t	vmovaps	%%zmm10,%%zmm12		\n\t"\
		"vmovaps	%%zmm3,%%zmm5			\n\t	vmovaps	%%zmm11,%%zmm13		\n\t"\
		"movq	%[__o0],%%rax				\n\t	movq	%[__u0],%%r10		\n\t"/* Inputs all read - reuse addr-regs for outputs */\
		"movq	%[__o1],%%rbx				\n\t	movq	%[__u1],%%r11		\n\t"\
		"movq	%[__o2],%%rcx				\n\t	movq	%[__u2],%%r12		\n\t"\
		"vaddpd	%%zmm6,%%zmm2,%%zmm2		\n\t	vaddpd	%%zmm14,%%zmm10,%%zmm10		\n\t"/* t1 = x1+x2 (per column) */\
		"vaddpd	%%zmm7,%%zmm3,%%zmm3		\n\t	vaddpd	%%zmm15,%%zmm11,%%zmm11		\n\t"\
		"vsubpd	%%zmm6,%%zmm4,%%zmm4		\n\t	vsubpd	%%zmm14,%%zmm12,%%zmm12		\n\t"/* t2 = x1-x2 (per column) */\
		"vsubpd	%%zmm7,%%zmm5,%%zmm5		\n\t	vsubpd	%%zmm15,%%zmm13,%%zmm13		\n\t"\
		"vaddpd	%%zmm2,%%zmm0,%%zmm0		\n\t	vaddpd	%%zmm10,%%zmm8 ,%%zmm8 		\n\t"/* x0 + t1 = DC output */\
		"vaddpd	%%zmm3,%%zmm1,%%zmm1		\n\t	vaddpd	%%zmm11,%%zmm9 ,%%zmm9 		\n\t"\
		"vmovaps	    (%%rdx),%%zmm6		\n\t"/* trig consts shared by both columns */\
		"vmovaps	0x40(%%rdx),%%zmm7		\n\t"\
		"vmovaps	%%zmm0,    (%%rax)		\n\t	vmovaps	%%zmm8 ,    (%%r10)	\n\t"/* write o0 / u0 */\
		"vmovaps	%%zmm1,0x40(%%rax)		\n\t	vmovaps	%%zmm9 ,0x40(%%r10)	\n\t"\
	" vfmadd132pd	%%zmm6,%%zmm0,%%zmm2 	\n\t  vfmadd132pd	%%zmm6,%%zmm8,%%zmm10	\n\t"\
	" vfmadd132pd	%%zmm6,%%zmm1,%%zmm3 	\n\t  vfmadd132pd	%%zmm6,%%zmm9,%%zmm11	\n\t"\
		"vmovaps	%%zmm2,%%zmm0			\n\t	vmovaps	%%zmm10,%%zmm8 		\n\t"\
		"vmovaps	%%zmm3,%%zmm1			\n\t	vmovaps	%%zmm11,%%zmm9 		\n\t"\
	" vfmadd231pd	%%zmm7,%%zmm5,%%zmm0 	\n\t  vfmadd231pd	%%zmm7,%%zmm13,%%zmm8 	\n\t"/* +- sine cross-terms */\
	"vfnmadd231pd	%%zmm7,%%zmm4,%%zmm1 	\n\t vfnmadd231pd	%%zmm7,%%zmm12,%%zmm9 	\n\t"\
	"vfnmadd231pd	%%zmm7,%%zmm5,%%zmm2 	\n\t vfnmadd231pd	%%zmm7,%%zmm13,%%zmm10	\n\t"\
	" vfmadd231pd	%%zmm7,%%zmm4,%%zmm3 	\n\t  vfmadd231pd	%%zmm7,%%zmm12,%%zmm11	\n\t"\
		"vmovaps	%%zmm0,    (%%rcx)		\n\t	vmovaps	%%zmm8 ,    (%%r12)	\n\t"\
		"vmovaps	%%zmm1,0x40(%%rcx)		\n\t	vmovaps	%%zmm9 ,0x40(%%r12)	\n\t"\
		"vmovaps	%%zmm2,    (%%rbx)		\n\t	vmovaps	%%zmm10,    (%%r11)	\n\t"\
		"vmovaps	%%zmm3,0x40(%%rbx)		\n\t	vmovaps	%%zmm11,0x40(%%r11)	\n\t"\
		:					/* outputs: none */\
		: [__cc0] "m" (Xcc0)	/* All inputs from memory addresses here */\
		 ,[__i0] "m" (Xi0)\
		 ,[__i1] "m" (Xi1)\
		 ,[__i2] "m" (Xi2)\
		 ,[__o0] "m" (Xo0)\
		 ,[__o1] "m" (Xo1)\
		 ,[__o2] "m" (Xo2)\
		 ,[__j0] "m" (Xj0)\
		 ,[__j1] "m" (Xj1)\
		 ,[__j2] "m" (Xj2)\
		 ,[__u0] "m" (Xu0)\
		 ,[__u1] "m" (Xu1)\
		 ,[__u2] "m" (Xu2)\
		: "cc","memory","rax","rbx","rcx","rdx","r10","r11","r12","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15"		/* Clobbered registers */\
	);\
	}

	/* Single length-3 complex DFT: inputs [__i0,__i1,__i2] -> outputs [__o0,__o1,__o2].
	__cc1 points to two trig vectors: (__cc1) is FMA-combined with the x1+x2 sum,
	0x40(__cc1) multiplies the x1-x2 difference (presumably c3m1 and s3 - confirm vs.
	callers). Outputs may coincide with inputs; all pointers must be 64-byte aligned. */
	#define SSE2_RADIX_03_DFT(Xi0,Xi1,Xi2, Xcc1, Xo0,Xo1,Xo2)\
	{\
	__asm__ volatile (\
		"movq	%[__i0],%%rax				\n\t"\
		"movq	%[__i1],%%rbx				\n\t"\
		"movq	%[__i2],%%rcx				\n\t"\
		"movq	%[__cc1],%%rdx				\n\t"\
		"vmovaps	    (%%rbx),%%zmm2		\n\t"\
		"vmovaps	0x40(%%rbx),%%zmm3		\n\t"\
		"vmovaps	    (%%rax),%%zmm0		\n\t"\
		"vmovaps	0x40(%%rax),%%zmm1		\n\t"\
		"vmovaps	    (%%rcx),%%zmm6		\n\t"\
		"vmovaps	0x40(%%rcx),%%zmm7		\n\t"\
		"vmovaps	%%zmm2,%%zmm4			\n\t"\
		"vmovaps	%%zmm3,%%zmm5			\n\t"\
		"movq	%[__o0],%%rax				\n\t"/* Inputs all read - reuse addr-regs for outputs */\
		"movq	%[__o1],%%rbx				\n\t"\
		"movq	%[__o2],%%rcx				\n\t"\
		"vaddpd	%%zmm6,%%zmm2,%%zmm2		\n\t"/* t1 = x1+x2 */\
		"vaddpd	%%zmm7,%%zmm3,%%zmm3		\n\t"\
		"vsubpd	%%zmm6,%%zmm4,%%zmm4		\n\t"/* t2 = x1-x2 */\
		"vsubpd	%%zmm7,%%zmm5,%%zmm5		\n\t"\
		"vaddpd	%%zmm2,%%zmm0,%%zmm0		\n\t"/* x0 + t1 = DC output */\
		"vaddpd	%%zmm3,%%zmm1,%%zmm1		\n\t"\
		"vmovaps	    (%%rdx),%%zmm6		\n\t"\
		"vmovaps	0x40(%%rdx),%%zmm7		\n\t"\
		"vmulpd	%%zmm7,%%zmm4,%%zmm4		\n\t"/* sine-scaled difference */\
		"vmulpd	%%zmm7,%%zmm5,%%zmm5		\n\t"\
		"vmovaps	%%zmm0,    (%%rax)		\n\t"/* write o0 */\
		"vmovaps	%%zmm1,0x40(%%rax)		\n\t"\
		"vfmadd132pd %%zmm6,%%zmm0,%%zmm2	\n\t"\
		"vfmadd132pd %%zmm6,%%zmm1,%%zmm3	\n\t"\
		"vaddpd	%%zmm5,%%zmm2,%%zmm0		\n\t"\
		"vsubpd	%%zmm4,%%zmm3,%%zmm1		\n\t"\
		"vsubpd	%%zmm5,%%zmm2,%%zmm2		\n\t"\
		"vaddpd	%%zmm4,%%zmm3,%%zmm3		\n\t"\
		"vmovaps	%%zmm0,    (%%rcx)		\n\t"\
		"vmovaps	%%zmm1,0x40(%%rcx)		\n\t"\
		"vmovaps	%%zmm2,    (%%rbx)		\n\t"\
		"vmovaps	%%zmm3,0x40(%%rbx)		\n\t"\
		:					/* outputs: none */\
		: [__i0] "m" (Xi0)	/* All inputs from memory addresses here */\
		 ,[__i1] "m" (Xi1)\
		 ,[__i2] "m" (Xi2)\
		 ,[__cc1] "m" (Xcc1)\
		 ,[__o0] "m" (Xo0)\
		 ,[__o1] "m" (Xo1)\
		 ,[__o2] "m" (Xo2)\
		: "cc","memory","rax","rbx","rcx","rdx","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7"		/* Clobbered registers */\
	);\
	}

	/* No-twiddle DIF radix-4 butterfly: reads 4 complex inputs from the local store at
	__tmp + [0,1,2,3]*__stride, writes results to main-array addresses __add0-3.
	Note the store pattern of the odd output pair: Re parts go to rcx/rdx but the
	matching Im parts go to the *other* of the two addresses (the implicit *I twiddle
	of the DIF butterfly). __stride is an "e"-constraint (compile-time) operand. */
	#define SSE2_RADIX4_DIF_0TWIDDLE_STRIDE(Xadd0, Xadd1, Xadd2, Xadd3, Xtmp, Xstride)\
	{\
	__asm__ volatile (\
		"movq	%[__tmp]   ,%%rax	\n\t"\
		"movq	%[__stride],%%rsi	\n\t"\
		"movq	%%rax,%%rbx			\n\t"\
		"addq	%%rsi,%%rbx			/* add_in1  */\n\t"\
		"shlq	$1,%%rsi			/* stride*2 */\n\t"\
		"vmovaps	    (%%rax),%%zmm0	\n\t"\
		"vmovaps	    (%%rbx),%%zmm2	\n\t"\
		"vmovaps	0x40(%%rax),%%zmm1	\n\t"\
		"vmovaps	0x40(%%rbx),%%zmm3	\n\t"\
		"vmovaps	    (%%rax),%%zmm4	\n\t"\
		"vmovaps	    (%%rbx),%%zmm6	\n\t"\
		"vmovaps	0x40(%%rax),%%zmm5	\n\t"\
		"vmovaps	0x40(%%rbx),%%zmm7	\n\t"\
		"addq	%%rsi,%%rax			/* add_in2  */\n\t"\
		"addq	%%rsi,%%rbx			/* add_in3  */\n\t"\
		"vaddpd	    (%%rax),%%zmm0,%%zmm0	\n\t"/* sums in zmm0-3 */\
		"vaddpd	    (%%rbx),%%zmm2,%%zmm2	\n\t"\
		"vaddpd	0x40(%%rax),%%zmm1,%%zmm1	\n\t"\
		"vaddpd	0x40(%%rbx),%%zmm3,%%zmm3	\n\t"\
		"vsubpd	    (%%rax),%%zmm4,%%zmm4	\n\t"/* differences in zmm4-7 */\
		"vsubpd	    (%%rbx),%%zmm6,%%zmm6	\n\t"\
		"vsubpd	0x40(%%rax),%%zmm5,%%zmm5	\n\t"\
		"vsubpd	0x40(%%rbx),%%zmm7,%%zmm7	\n\t"\
		"/* Finish radix-4 butterfly and store results into main-array slots: */\n\t"\
		"movq	%[__add0],%%rax		\n\t"\
		"movq	%[__add1],%%rbx		\n\t"\
		"movq	%[__add2],%%rcx		\n\t"\
		"movq	%[__add3],%%rdx		\n\t"\
		"vsubpd	%%zmm2,%%zmm0,%%zmm0		\n\t"\
		"vsubpd	%%zmm7,%%zmm4,%%zmm4		\n\t"\
		"vsubpd	%%zmm3,%%zmm1,%%zmm1		\n\t"\
		"vsubpd	%%zmm6,%%zmm5,%%zmm5		\n\t"\
		"vmovaps	%%zmm0,    (%%rbx)	\n\t"\
		"vmovaps	%%zmm4,    (%%rcx)	\n\t"\
		"vmovaps	%%zmm1,0x40(%%rbx)	\n\t"\
		"vmovaps	%%zmm5,0x40(%%rdx)	\n\t"\
		"vaddpd	%%zmm2,%%zmm2,%%zmm2		\n\t"/* double, then add back: x+y = (x-y) + 2y */\
		"vaddpd	%%zmm7,%%zmm7,%%zmm7		\n\t"\
		"vaddpd	%%zmm3,%%zmm3,%%zmm3		\n\t"\
		"vaddpd	%%zmm6,%%zmm6,%%zmm6		\n\t"\
		"vaddpd	%%zmm0,%%zmm2,%%zmm2		\n\t"\
		"vaddpd	%%zmm4,%%zmm7,%%zmm7		\n\t"\
		"vaddpd	%%zmm1,%%zmm3,%%zmm3		\n\t"\
		"vaddpd	%%zmm5,%%zmm6,%%zmm6		\n\t"\
		"vmovaps	%%zmm2,    (%%rax)	\n\t"\
		"vmovaps	%%zmm7,    (%%rdx)	\n\t"\
		"vmovaps	%%zmm3,0x40(%%rax)	\n\t"\
		"vmovaps	%%zmm6,0x40(%%rcx)	\n\t"\
		:					/* outputs: none */\
		: [__add0] "m" (Xadd0)	/* All inputs from memory addresses here */\
		 ,[__add1] "m" (Xadd1)\
		 ,[__add2] "m" (Xadd2)\
		 ,[__add3] "m" (Xadd3)\
		 ,[__tmp] "m" (Xtmp)\
		 ,[__stride] "e" (Xstride)\
		: "cc","memory","rax","rbx","rcx","rdx","rsi","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7"		/* Clobbered registers */\
	);\
	}

	/* DIF radix-4 subconvolution, sans twiddles, inputs in __i0-3, outputs in __o0-3, possibly coincident with inputs: */
	/* Same butterfly as SSE2_RADIX4_DIF_0TWIDDLE_STRIDE, but with four explicit
	input pointers instead of strided addressing. All pointers 64-byte aligned. */
	#define SSE2_RADIX4_DIF_0TWIDDLE_STRIDE_E(Xi0,Xi1,Xi2,Xi3, Xo0,Xo1,Xo2,Xo3)\
	{\
	__asm__ volatile (\
		"movq	%[__i0],%%rax		\n\t"\
		"movq	%[__i1],%%rbx		\n\t"\
		"movq	%[__i2],%%rcx		\n\t"\
		"movq	%[__i3],%%rdx		\n\t"\
		"vmovaps	    (%%rax),%%zmm0		\n\t"\
		"vmovaps	    (%%rbx),%%zmm4		\n\t"\
		"vmovaps	0x40(%%rax),%%zmm1		\n\t"\
		"vmovaps	0x40(%%rbx),%%zmm5		\n\t"\
		"vmovaps	%%zmm0,%%zmm2			\n\t"\
		"vmovaps	%%zmm4,%%zmm6			\n\t"\
		"vmovaps	%%zmm1,%%zmm3			\n\t"\
		"vmovaps	%%zmm5,%%zmm7			\n\t"\
		"vaddpd	    (%%rcx),%%zmm0,%%zmm0	\n\t"/* sums: i0+i2, i1+i3 */\
		"vaddpd	    (%%rdx),%%zmm4,%%zmm4	\n\t"\
		"vaddpd	0x40(%%rcx),%%zmm1,%%zmm1	\n\t"\
		"vaddpd	0x40(%%rdx),%%zmm5,%%zmm5	\n\t"\
		"vsubpd	    (%%rcx),%%zmm2,%%zmm2	\n\t"/* differences: i0-i2, i1-i3 */\
		"vsubpd	    (%%rdx),%%zmm6,%%zmm6	\n\t"\
		"vsubpd	0x40(%%rcx),%%zmm3,%%zmm3	\n\t"\
		"vsubpd	0x40(%%rdx),%%zmm7,%%zmm7	\n\t"\
		"/* Finish radix-4 butterfly and store results into main-array slots: */\n\t"\
		"movq	%[__o0],%%rax					\n\t"\
		"movq	%[__o1],%%rbx					\n\t"\
		"movq	%[__o2],%%rcx					\n\t"\
		"movq	%[__o3],%%rdx					\n\t"\
		"vsubpd	%%zmm4,%%zmm0,%%zmm0		\n\t"\
		"vsubpd	%%zmm7,%%zmm2,%%zmm2		\n\t"\
		"vsubpd	%%zmm5,%%zmm1,%%zmm1		\n\t"\
		"vsubpd	%%zmm6,%%zmm3,%%zmm3		\n\t"\
		"vmovaps	%%zmm0,    (%%rbx)		\n\t"\
		"vmovaps	%%zmm2,    (%%rcx)		\n\t"\
		"vmovaps	%%zmm1,0x40(%%rbx)		\n\t"\
		"vmovaps	%%zmm3,0x40(%%rdx)		\n\t"/* Re/Im of odd pair cross-stored: implicit *I twiddle */\
		"vaddpd	%%zmm4,%%zmm4,%%zmm4		\n\t"/* double, then add back: x+y = (x-y) + 2y */\
		"vaddpd	%%zmm7,%%zmm7,%%zmm7		\n\t"\
		"vaddpd	%%zmm5,%%zmm5,%%zmm5		\n\t"\
		"vaddpd	%%zmm6,%%zmm6,%%zmm6		\n\t"\
		"vaddpd	%%zmm0,%%zmm4,%%zmm4		\n\t"\
		"vaddpd	%%zmm2,%%zmm7,%%zmm7		\n\t"\
		"vaddpd	%%zmm1,%%zmm5,%%zmm5		\n\t"\
		"vaddpd	%%zmm3,%%zmm6,%%zmm6		\n\t"\
		"vmovaps	%%zmm4,    (%%rax)		\n\t"\
		"vmovaps	%%zmm7,    (%%rdx)		\n\t"\
		"vmovaps	%%zmm5,0x40(%%rax)		\n\t"\
		"vmovaps	%%zmm6,0x40(%%rcx)		\n\t"\
		:					/* outputs: none */\
		: [__i0] "m" (Xi0)	/* All inputs from memory addresses here */\
		 ,[__i1] "m" (Xi1)\
		 ,[__i2] "m" (Xi2)\
		 ,[__i3] "m" (Xi3)\
		 ,[__o0] "m" (Xo0)\
		 ,[__o1] "m" (Xo1)\
		 ,[__o2] "m" (Xo2)\
		 ,[__o3] "m" (Xo3)\
		: "cc","memory","rax","rbx","rcx","rdx","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7"		/* Clobbered registers */\
	);\
	}

	/* No-twiddle DIT radix-4 butterfly: gathers 4 complex inputs from main-array
	addresses __add0-3, writes results to the local store at __tmp + [0,1,2,3]*__stride.
	The rax/rbx/rcx/rdx arithmetic mid-macro rebuilds __tmp + 1x,2x,3x strides once the
	input addresses are no longer needed. NOTE(review): "rsi" appears in the clobber
	list but is never used in this macro - harmless over-declaration, left as-is. */
	#define SSE2_RADIX4_DIT_0TWIDDLE_STRIDE(Xadd0, Xadd1, Xadd2, Xadd3, Xtmp, Xstride)\
	{\
	__asm__ volatile (\
		"movq	%[__add0],%%rax		\n\t"\
		"movq	%[__add1],%%rbx		\n\t"\
		"movq	%[__add2],%%rcx		\n\t"\
		"movq	%[__add3],%%rdx		\n\t"\
		"vmovaps	    (%%rax),%%zmm0	\n\t"\
		"vmovaps	    (%%rcx),%%zmm4	\n\t"\
		"vmovaps	0x40(%%rax),%%zmm1	\n\t"\
		"vmovaps	0x40(%%rcx),%%zmm5	\n\t"\
		"vmovaps	%%zmm0,%%zmm2			\n\t"\
		"vmovaps	%%zmm4,%%zmm6			\n\t"\
		"vmovaps	%%zmm1,%%zmm3			\n\t"\
		"vmovaps	%%zmm5,%%zmm7			\n\t"\
		"movq	%[__tmp]   ,%%rax	\n\t"/* rax/rcx inputs already consumed - reuse */\
		"movq	%[__stride],%%rcx	\n\t"\
		"vaddpd	    (%%rbx),%%zmm0,%%zmm0	\n\t"/* sums in zmm0,1,4,5 */\
		"vaddpd	    (%%rdx),%%zmm4,%%zmm4	\n\t"\
		"vaddpd	0x40(%%rbx),%%zmm1,%%zmm1	\n\t"\
		"vaddpd	0x40(%%rdx),%%zmm5,%%zmm5	\n\t"\
		"vsubpd	    (%%rbx),%%zmm2,%%zmm2	\n\t"/* differences in zmm2,3,6,7 */\
		"vsubpd	    (%%rdx),%%zmm6,%%zmm6	\n\t"\
		"vsubpd	0x40(%%rbx),%%zmm3,%%zmm3	\n\t"\
		"vsubpd	0x40(%%rdx),%%zmm7,%%zmm7	\n\t"\
		"movq	%%rax,%%rbx			\n\t"/* rbx = tmp + stride */\
		"addq	%%rcx,%%rbx			\n\t"\
		"movq	%%rbx,%%rdx			\n\t"/* rdx = tmp + 3*stride */\
		"addq	%%rcx,%%rcx			\n\t"\
		"addq	%%rcx,%%rdx			\n\t"\
		"addq	%%rax,%%rcx			\n\t"/* rcx = tmp + 2*stride */\
		"/* Finish radix-4 butterfly and store results into temp-array slots: */\n\t"\
		"vsubpd	%%zmm4,%%zmm0,%%zmm0			\n\t"\
		"vsubpd	%%zmm7,%%zmm2,%%zmm2			\n\t"\
		"vsubpd	%%zmm5,%%zmm1,%%zmm1			\n\t"\
		"vsubpd	%%zmm6,%%zmm3,%%zmm3			\n\t"\
		"vmovaps	%%zmm0,    (%%rcx)	\n\t"\
		"vmovaps	%%zmm2,    (%%rdx)	\n\t"\
		"vmovaps	%%zmm1,0x40(%%rcx)	\n\t"\
		"vmovaps	%%zmm3,0x40(%%rbx)	\n\t"/* Re/Im of odd pair cross-stored: implicit *I twiddle */\
		"vaddpd	%%zmm4,%%zmm4,%%zmm4			\n\t"/* double, then add back: x+y = (x-y) + 2y */\
		"vaddpd	%%zmm7,%%zmm7,%%zmm7			\n\t"\
		"vaddpd	%%zmm5,%%zmm5,%%zmm5			\n\t"\
		"vaddpd	%%zmm6,%%zmm6,%%zmm6			\n\t"\
		"vaddpd	%%zmm0,%%zmm4,%%zmm4			\n\t"\
		"vaddpd	%%zmm2,%%zmm7,%%zmm7			\n\t"\
		"vaddpd	%%zmm1,%%zmm5,%%zmm5			\n\t"\
		"vaddpd	%%zmm3,%%zmm6,%%zmm6			\n\t"\
		"vmovaps	%%zmm4,    (%%rax)	\n\t"\
		"vmovaps	%%zmm7,    (%%rbx)	\n\t"\
		"vmovaps	%%zmm5,0x40(%%rax)	\n\t"\
		"vmovaps	%%zmm6,0x40(%%rdx)	\n\t"\
		:					/* outputs: none */\
		: [__add0] "m" (Xadd0)	/* All inputs from memory addresses here */\
		 ,[__add1] "m" (Xadd1)\
		 ,[__add2] "m" (Xadd2)\
		 ,[__add3] "m" (Xadd3)\
		 ,[__tmp] "m" (Xtmp)\
		 ,[__stride] "e" (Xstride)\
		: "cc","memory","rax","rbx","rcx","rdx","rsi","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7"		/* Clobbered registers */\
	);\
	}

	/* DIT radix-4 subconvolution, sans twiddles, inputs in __i0-3, outputs in __o0-3, possibly coincident with inputs: */
	/* Same butterfly as SSE2_RADIX4_DIT_0TWIDDLE_STRIDE, but with four explicit
	input and output pointers. All pointers must be 64-byte aligned (vmovaps). */
	#define SSE2_RADIX4_DIT_0TWIDDLE_STRIDE_E(Xi0,Xi1,Xi2,Xi3, Xo0,Xo1,Xo2,Xo3)\
	{\
	__asm__ volatile (\
		"movq	%[__i0],%%rax		\n\t"\
		"movq	%[__i1],%%rbx		\n\t"\
		"movq	%[__i2],%%rcx		\n\t"\
		"movq	%[__i3],%%rdx		\n\t"\
		"vmovaps	    (%%rax),%%zmm0		\n\t"\
		"vmovaps	    (%%rcx),%%zmm4		\n\t"\
		"vmovaps	0x40(%%rax),%%zmm1		\n\t"\
		"vmovaps	0x40(%%rcx),%%zmm5		\n\t"\
		"vmovaps	%%zmm0,%%zmm2			\n\t"\
		"vmovaps	%%zmm4,%%zmm6			\n\t"\
		"vmovaps	%%zmm1,%%zmm3			\n\t"\
		"vmovaps	%%zmm5,%%zmm7			\n\t"\
		"vaddpd	    (%%rbx),%%zmm0,%%zmm0	\n\t"/* sums: i0+i1, i2+i3 */\
		"vaddpd	    (%%rdx),%%zmm4,%%zmm4	\n\t"\
		"vaddpd	0x40(%%rbx),%%zmm1,%%zmm1	\n\t"\
		"vaddpd	0x40(%%rdx),%%zmm5,%%zmm5	\n\t"\
		"vsubpd	    (%%rbx),%%zmm2,%%zmm2	\n\t"/* differences: i0-i1, i2-i3 */\
		"vsubpd	    (%%rdx),%%zmm6,%%zmm6	\n\t"\
		"vsubpd	0x40(%%rbx),%%zmm3,%%zmm3	\n\t"\
		"vsubpd	0x40(%%rdx),%%zmm7,%%zmm7	\n\t"\
		"/* Finish radix-4 butterfly and store results into output-array slots: */\n\t"\
		"movq	%[__o0],%%rax					\n\t"\
		"movq	%[__o1],%%rbx					\n\t"\
		"movq	%[__o2],%%rcx					\n\t"\
		"movq	%[__o3],%%rdx					\n\t"\
		"vsubpd	%%zmm4,%%zmm0,%%zmm0		\n\t"\
		"vsubpd	%%zmm7,%%zmm2,%%zmm2		\n\t"\
		"vsubpd	%%zmm5,%%zmm1,%%zmm1		\n\t"\
		"vsubpd	%%zmm6,%%zmm3,%%zmm3		\n\t"\
		"vmovaps	%%zmm0,    (%%rcx)		\n\t"\
		"vmovaps	%%zmm2,    (%%rdx)		\n\t"\
		"vmovaps	%%zmm1,0x40(%%rcx)		\n\t"\
		"vmovaps	%%zmm3,0x40(%%rbx)		\n\t"/* Re/Im of odd pair cross-stored: implicit *I twiddle */\
		"vaddpd	%%zmm4,%%zmm4,%%zmm4		\n\t"/* double, then add back: x+y = (x-y) + 2y */\
		"vaddpd	%%zmm7,%%zmm7,%%zmm7		\n\t"\
		"vaddpd	%%zmm5,%%zmm5,%%zmm5		\n\t"\
		"vaddpd	%%zmm6,%%zmm6,%%zmm6		\n\t"\
		"vaddpd	%%zmm0,%%zmm4,%%zmm4		\n\t"\
		"vaddpd	%%zmm2,%%zmm7,%%zmm7		\n\t"\
		"vaddpd	%%zmm1,%%zmm5,%%zmm5		\n\t"\
		"vaddpd	%%zmm3,%%zmm6,%%zmm6		\n\t"\
		"vmovaps	%%zmm4,    (%%rax)		\n\t"\
		"vmovaps	%%zmm7,    (%%rbx)		\n\t"\
		"vmovaps	%%zmm5,0x40(%%rax)		\n\t"\
		"vmovaps	%%zmm6,0x40(%%rdx)		\n\t"\
		:					/* outputs: none */\
		: [__i0] "m" (Xi0)	/* All inputs from memory addresses here */\
		 ,[__i1] "m" (Xi1)\
		 ,[__i2] "m" (Xi2)\
		 ,[__i3] "m" (Xi3)\
		 ,[__o0] "m" (Xo0)\
		 ,[__o1] "m" (Xo1)\
		 ,[__o2] "m" (Xo2)\
		 ,[__o3] "m" (Xo3)\
		: "cc","memory","rax","rbx","rcx","rdx","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7"		/* Clobbered registers */\
	);\
	}

	/* No-twiddle radix-5 DFT: inputs [__i0..__i4] -> outputs [__o0..__o4], possibly
	coincident. __cc1 points to the radix-5 trig data, read at offsets 0x0,0x40,0x80,
	0xc0,0x100. zmm31 is broadcast-loaded with 2.0 (IEEE-754 bit pattern
	0x4000000000000000) and used for the FMA-based x+y = (x-y) + 2y reconstructions.
	BUGFIX: zmm8/zmm9 are used as scratch throughout the second half of the macro
	(DC accumulation and the sine-part combinations) but were missing from the
	clobber list, letting the compiler wrongly assume they survive the asm block;
	"xmm8","xmm9" are now declared clobbered. */
	#define SSE2_RADIX_05_DFT_0TWIDDLE(Xi0,Xi1,Xi2,Xi3,Xi4, Xcc1, Xo0,Xo1,Xo2,Xo3,Xo4)\
	{\
	__asm__ volatile (\
		"movq $0x4000000000000000,%%rdi	\n\t	vpbroadcastq  %%rdi,%%zmm31	\n\t"/* 2.0 */\
		"movq	%[__i0],%%rsi		\n\t"\
		"movq	%[__i1],%%rax		\n\t"\
		"movq	%[__i2],%%rbx		\n\t"\
		"movq	%[__i3],%%rcx		\n\t"\
		"movq	%[__i4],%%rdx		\n\t"\
		"movq	%[__o0],%%rdi		\n\t"\
		"vmovaps	    (%%rax),%%zmm0	\n\t"\
		"vmovaps	0x40(%%rax),%%zmm1	\n\t"\
		"vmovaps	    (%%rbx),%%zmm2	\n\t"\
		"vmovaps	0x40(%%rbx),%%zmm3	\n\t"\
		"vmovaps	    (%%rcx),%%zmm4	\n\t"\
		"vmovaps	0x40(%%rcx),%%zmm5	\n\t"\
		"vmovaps	    (%%rdx),%%zmm6	\n\t"\
		"vmovaps	0x40(%%rdx),%%zmm7	\n\t"\
		"vsubpd	%%zmm6,%%zmm0,%%zmm0	\n\t"/* b1 = x1-x4 */\
		"vsubpd	%%zmm7,%%zmm1,%%zmm1	\n\t"\
	"vfmadd132pd %%zmm31,%%zmm0,%%zmm6	\n\t"/* a1 = x1+x4 via (x1-x4) + 2*x4 */\
	"vfmadd132pd %%zmm31,%%zmm1,%%zmm7	\n\t"\
		"vsubpd	%%zmm4,%%zmm2,%%zmm2	\n\t"/* b2 = x2-x3 */\
		"vsubpd	%%zmm5,%%zmm3,%%zmm3	\n\t"\
	"vfmadd132pd %%zmm31,%%zmm2,%%zmm4	\n\t"/* a2 = x2+x3 */\
	"vfmadd132pd %%zmm31,%%zmm3,%%zmm5	\n\t"\
	"movq	%[__cc1],%%rax		\n\t"/* rax now points to trig data */\
		"vsubpd	%%zmm4,%%zmm6,%%zmm6	\n\t"/* a1-a2 */\
		"vsubpd	%%zmm5,%%zmm7,%%zmm7	\n\t"\
	"vfmadd132pd %%zmm31,%%zmm6,%%zmm4	\n\t"/* a1+a2 */\
	"vfmadd132pd %%zmm31,%%zmm7,%%zmm5	\n\t"\
		"vaddpd	    (%%rsi),%%zmm4,%%zmm8	\n\t"/* += X0 */\
		"vaddpd	0x40(%%rsi),%%zmm5,%%zmm9	\n\t"\
	/* No FMA here, since need resulting MM6,7 as args to several ops below: */\
		"vmulpd	0x40(%%rax),%%zmm6,%%zmm6	\n\t"\
		"vmulpd	0x40(%%rax),%%zmm7,%%zmm7	\n\t"\
		"vmovaps	%%zmm8,    (%%rdi)	\n\t"/* Write DC term */\
		"vmovaps	%%zmm9,0x40(%%rdi)	\n\t"\
	"vfmadd132pd (%%rax),%%zmm8,%%zmm4	\n\t"\
	"vfmadd132pd (%%rax),%%zmm9,%%zmm5	\n\t"\
		"vsubpd	%%zmm6,%%zmm4,%%zmm4	\n\t"\
		"vsubpd	%%zmm7,%%zmm5,%%zmm5	\n\t"\
	"vfmadd132pd %%zmm31,%%zmm4,%%zmm6	\n\t"\
	"vfmadd132pd %%zmm31,%%zmm5,%%zmm7	\n\t"\
		"vsubpd	%%zmm2,%%zmm0,%%zmm8	\n\t"/* b1-b2 into scratch zmm8,9 */\
		"vsubpd	%%zmm3,%%zmm1,%%zmm9	\n\t"\
	/* No FMA here, since need resulting MM8,9 as args to several ops below: */\
		"vmulpd	0x80(%%rax),%%zmm8,%%zmm8	\n\t"\
		"vmulpd	0x80(%%rax),%%zmm9,%%zmm9	\n\t"\
	" vfmadd132pd 0x0c0(%%rax),%%zmm8,%%zmm2\n\t"\
	" vfmadd132pd 0x0c0(%%rax),%%zmm9,%%zmm3\n\t"\
	"vfnmadd231pd 0x100(%%rax),%%zmm0,%%zmm8\n\t"\
	"vfnmadd231pd 0x100(%%rax),%%zmm1,%%zmm9\n\t"\
	"movq	%[__o1],%%rax		\n\t"/* done with trig ptr - reuse for outputs */\
	"movq	%[__o4],%%rdx		\n\t"\
		"vsubpd	%%zmm3,%%zmm6,%%zmm6	\n\t"\
		"vsubpd	%%zmm2,%%zmm7,%%zmm7	\n\t"\
	"vfmadd132pd %%zmm31,%%zmm6,%%zmm3	\n\t"\
	"vfmadd132pd %%zmm31,%%zmm7,%%zmm2	\n\t"\
		"vmovaps	%%zmm6,    (%%rax)	\n\t"/* o1/o4 get conjugate-pair combos */\
		"vmovaps	%%zmm7,0x40(%%rdx)	\n\t"\
		"vmovaps	%%zmm3,    (%%rdx)	\n\t"\
		"vmovaps	%%zmm2,0x40(%%rax)	\n\t"\
	"movq	%[__o2],%%rbx		\n\t"\
	"movq	%[__o3],%%rcx		\n\t"\
		"vsubpd	%%zmm9,%%zmm4,%%zmm4	\n\t"\
		"vsubpd	%%zmm8,%%zmm5,%%zmm5	\n\t"\
	"vfmadd132pd %%zmm31,%%zmm4,%%zmm9	\n\t"\
	"vfmadd132pd %%zmm31,%%zmm5,%%zmm8	\n\t"\
		"vmovaps	%%zmm4,    (%%rbx)	\n\t"/* o2/o3 likewise */\
		"vmovaps	%%zmm5,0x40(%%rcx)	\n\t"\
		"vmovaps	%%zmm9,    (%%rcx)	\n\t"\
		"vmovaps	%%zmm8,0x40(%%rbx)	\n\t"\
		:					/* outputs: none */\
		: [__i0] "m" (Xi0)	/* All inputs from memory addresses here */\
		 ,[__i1] "m" (Xi1)\
		 ,[__i2] "m" (Xi2)\
		 ,[__i3] "m" (Xi3)\
		 ,[__i4] "m" (Xi4)\
		 ,[__cc1] "m" (Xcc1)\
		 ,[__o0] "m" (Xo0)\
		 ,[__o1] "m" (Xo1)\
		 ,[__o2] "m" (Xo2)\
		 ,[__o3] "m" (Xo3)\
		 ,[__o4] "m" (Xo4)\
		: "cc","memory","rax","rbx","rcx","rdx","rdi","rsi","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm31"	/* Clobbered registers */\
	);\
	}

	// Cost: 38 DP-math, 31 vector MOV for each of the two side-by-side 5-DFTs
	/* Twin no-twiddle radix-5 DFTs, executed side-by-side in the two instruction
	columns: left column maps [__i0..__i4] -> [__o0..__o4] in zmm0-7 (plus scratch),
	right column maps [__j0..__j4] -> [__u0..__u4] in zmm8-15. __cc1 points to the
	shared radix-5 trig data (offsets 0x0-0x100); __two points to a vector of 2.0,
	broadcast into zmm31 for the FMA-based x+y = (x-y) + 2y reconstructions.
	Register pressure forces two spill/reload pairs (zmm2,3 and zmm0,1) through the
	left column's input slots, which are overwritten in the process - hence inputs
	__i2 (rbx) and __i0 (rsi) are also used as scratch memory. */
	#define SSE2_RADIX_05_DFT_0TWIDDLE_X2(Xcc1,Xtwo, Xi0,Xi1,Xi2,Xi3,Xi4, Xo0,Xo1,Xo2,Xo3,Xo4, Xj0,Xj1,Xj2,Xj3,Xj4, Xu0,Xu1,Xu2,Xu3,Xu4)\
	{\
	__asm__ volatile (\
		"movq	%[__i1],%%rax				\n\t	movq	%[__j1],%%r11			\n\t"\
		"movq	%[__i4],%%rdx				\n\t	movq	%[__j4],%%r14			\n\t"\
		"vmovaps	    (%%rax),%%zmm0		\n\t	vmovaps	    (%%r11),%%zmm8 		\n\t"\
		"vmovaps	0x40(%%rax),%%zmm1		\n\t	vmovaps	0x40(%%r11),%%zmm9 		\n\t"\
		"vmovaps	    (%%rdx),%%zmm6		\n\t	vmovaps	    (%%r14),%%zmm14		\n\t"\
		"vmovaps	0x40(%%rdx),%%zmm7		\n\t	vmovaps	0x40(%%r14),%%zmm15		\n\t"\
		"movq	%[__i2],%%rbx				\n\t	movq	%[__j2],%%r12			\n\t"\
		"movq	%[__i3],%%rcx				\n\t	movq	%[__j3],%%r13			\n\t"\
		"vsubpd	%%zmm6,%%zmm0,%%zmm0		\n\t	vsubpd	%%zmm14,%%zmm8 ,%%zmm8 	\n\t"/* b1 = x1-x4 */\
		"vsubpd	%%zmm7,%%zmm1,%%zmm1		\n\t	vsubpd	%%zmm15,%%zmm9 ,%%zmm9 	\n\t"\
		"vmovaps	    (%%rbx),%%zmm2		\n\t	vmovaps	    (%%r12),%%zmm10		\n\t"\
		"vmovaps	0x40(%%rbx),%%zmm3		\n\t	vmovaps	0x40(%%r12),%%zmm11		\n\t"\
		"vmovaps	    (%%rcx),%%zmm4		\n\t	vmovaps	    (%%r13),%%zmm12		\n\t"\
		"vmovaps	0x40(%%rcx),%%zmm5		\n\t	vmovaps	0x40(%%r13),%%zmm13		\n\t"\
	"movq		%[__two],%%rcx				\n\t	vmovaps	(%%rcx),%%zmm31			\n\t"/* two */\
		"vsubpd	%%zmm4,%%zmm2,%%zmm2		\n\t	vsubpd		%%zmm12,%%zmm10,%%zmm10		\n\t"/* b2 = x2-x3 */\
		"vsubpd	%%zmm5,%%zmm3,%%zmm3		\n\t	vsubpd		%%zmm13,%%zmm11,%%zmm11		\n\t"\
	" vfmadd132pd	%%zmm31,%%zmm0,%%zmm6 	\n\t  vfmadd132pd		%%zmm31,%%zmm8,%%zmm14	\n\t"/* a1 = x1+x4 */\
	" vfmadd132pd	%%zmm31,%%zmm1,%%zmm7 	\n\t  vfmadd132pd		%%zmm31,%%zmm9,%%zmm15	\n\t"\
	" vfmadd132pd	%%zmm31,%%zmm2,%%zmm4 	\n\t  vfmadd132pd		%%zmm31,%%zmm10,%%zmm12	\n\t"/* a2 = x2+x3 */\
	" vfmadd132pd	%%zmm31,%%zmm3,%%zmm5 	\n\t  vfmadd132pd		%%zmm31,%%zmm11,%%zmm13	\n\t"\
	/*==== spill zmm2,3 here (still use once as dest of add/sub below, but pvsly carried copies of values-here around) =====*/\
	"vmovaps	%%zmm2,    (%%rbx)		\n\t	vmovaps	%%zmm10,    (%%r12)		\n\t"\
	"vmovaps	%%zmm3,0x40(%%rbx)		\n\t	vmovaps	%%zmm11,0x40(%%r12)		\n\t"\
		"movq	%[__cc1],%%rax				\n\t"\
		"vsubpd	%%zmm4,%%zmm6,%%zmm6		\n\t	vsubpd	%%zmm12,%%zmm14,%%zmm14		\n\t"/* a1-a2 */\
		"vsubpd	%%zmm5,%%zmm7,%%zmm7		\n\t	vsubpd	%%zmm13,%%zmm15,%%zmm15		\n\t"\
		"movq	%[__i0],%%rsi				\n\t	movq	%[__j0],%%r10		\n\t"\
	" vfmadd132pd	%%zmm31,%%zmm6,%%zmm4 	\n\t  vfmadd132pd	%%zmm31,%%zmm14,%%zmm12	\n\t"/* a1+a2 */\
	" vfmadd132pd	%%zmm31,%%zmm7,%%zmm5 	\n\t  vfmadd132pd	%%zmm31,%%zmm15,%%zmm13	\n\t"\
		"vaddpd	    (%%rsi),%%zmm4,%%zmm2	\n\t	vaddpd	    (%%r10),%%zmm12,%%zmm10	\n\t"/* += X0: DC term */\
		"vaddpd	0x40(%%rsi),%%zmm5,%%zmm3	\n\t	vaddpd	0x40(%%r10),%%zmm13,%%zmm11	\n\t"\
		"movq	%[__o0],%%rdi				\n\t	movq	%[__u0],%%r15		\n\t"\
		"vmovaps	%%zmm2,    (%%rdi)		\n\t	vmovaps	%%zmm10,    (%%r15)		\n\t"/* write o0/u0 = DC */\
		"vmovaps	%%zmm3,0x40(%%rdi)		\n\t	vmovaps	%%zmm11,0x40(%%r15)		\n\t"\
	"vmovaps	    (%%rax),%%zmm2	\n\t"/* each of these 2 mults used 4x, so trade 8 loads for 4: */\
	"vmovaps	0x40(%%rax),%%zmm3	\n\t"/* 2 of mults, 2 to read clobbered zmm2,3 values from mem */\
		"vmulpd			%%zmm3 ,%%zmm6,%%zmm6	\n\t	vmulpd		%%zmm3,%%zmm14,%%zmm14	\n\t"\
		"vmulpd			%%zmm3 ,%%zmm7,%%zmm7	\n\t	vmulpd		%%zmm3,%%zmm15,%%zmm15	\n\t"\
	" vfmadd213pd	    (%%rdi),%%zmm2,%%zmm4 	\n\t  vfmadd132pd	%%zmm2,%%zmm10,%%zmm12	\n\t"/* left col re-reads DC from mem, right still has it in regs */\
	" vfmadd213pd	0x40(%%rdi),%%zmm2,%%zmm5 	\n\t  vfmadd132pd	%%zmm2,%%zmm11,%%zmm13	\n\t"\
		"vsubpd	%%zmm6,%%zmm4,%%zmm4		\n\t	vsubpd	%%zmm14,%%zmm12,%%zmm12		\n\t"\
		"vsubpd	%%zmm7,%%zmm5,%%zmm5		\n\t	vsubpd	%%zmm15,%%zmm13,%%zmm13		\n\t"\
	" vfmadd132pd	%%zmm31,%%zmm4,%%zmm6 	\n\t  vfmadd132pd	%%zmm31,%%zmm12,%%zmm14	\n\t"\
	" vfmadd132pd	%%zmm31,%%zmm5,%%zmm7 	\n\t  vfmadd132pd	%%zmm31,%%zmm13,%%zmm15	\n\t"\
		"vmovaps	%%zmm4,    (%%rsi)		\n\t	vmovaps	%%zmm12,    (%%r10)		\n\t"/* park cosine-part results in i0/j0 slots */\
		"vmovaps	%%zmm5,0x40(%%rsi)		\n\t	vmovaps	%%zmm13,0x40(%%r10)		\n\t"\
	/*==== restore spill of zmm2,3 here =====*/\
	"vmovaps	    (%%rbx),%%zmm2		\n\t	vmovaps	    (%%r12),%%zmm10		\n\t"\
	"vmovaps	0x40(%%rbx),%%zmm3		\n\t	vmovaps	0x40(%%r12),%%zmm11		\n\t"\
	/*==== spill zmm0,1 here (still use once as mult in 2 of the 8 FMAs below, but now read those values from the spill addrs) =====*/\
	"vmovaps	0x80(%%rax),%%zmm4	\n\t"\
		"vmovaps	%%zmm0,    (%%rbx)		\n\t	vmovaps	%%zmm8 ,%%zmm12			\n\t"/* left spills to mem, right copies to regs */\
		"vmovaps	%%zmm1,0x40(%%rbx)		\n\t	vmovaps	%%zmm9 ,%%zmm13			\n\t"\
		"vsubpd	%%zmm2,%%zmm0,%%zmm0		\n\t	vsubpd	%%zmm10,%%zmm8,%%zmm8 	\n\t"/* b1-b2 */\
		"vsubpd	%%zmm3,%%zmm1,%%zmm1		\n\t	vsubpd	%%zmm11,%%zmm9,%%zmm9 	\n\t"\
		"vmulpd	%%zmm4,%%zmm0,%%zmm0		\n\t	vmulpd	%%zmm4 ,%%zmm8,%%zmm8 	\n\t"\
		"vmulpd	%%zmm4,%%zmm1,%%zmm1		\n\t	vmulpd	%%zmm4 ,%%zmm9,%%zmm9 	\n\t"\
	"vmovaps	0x0c0(%%rax),%%zmm4	\n\t"/* each of these 2 mults used 4x, so trade 8 loads for 4: */\
	"vmovaps	0x100(%%rax),%%zmm5	\n\t"/* 2 of mults, 2 to read clobbered zmm2,3 values from mem */\
	" vfmadd132pd		%%zmm4 ,%%zmm0,%%zmm2 	\n\t  vfmadd132pd	%%zmm4 ,%%zmm8 ,%%zmm10	\n\t"\
	" vfmadd132pd		%%zmm4 ,%%zmm1,%%zmm3 	\n\t  vfmadd132pd	%%zmm4 ,%%zmm9 ,%%zmm11	\n\t"\
	"vfnmadd231pd	    (%%rbx),%%zmm5,%%zmm0 	\n\t vfnmadd231pd	%%zmm5 ,%%zmm12,%%zmm8 	\n\t"\
	"vfnmadd231pd	0x40(%%rbx),%%zmm5,%%zmm1 	\n\t vfnmadd231pd	%%zmm5 ,%%zmm13,%%zmm9 	\n\t"\
		"vmovaps	    (%%rsi),%%zmm4		\n\t	vmovaps	    (%%r10),%%zmm12		\n\t"/* reload parked cosine-part results */\
		"vmovaps	0x40(%%rsi),%%zmm5		\n\t	vmovaps	0x40(%%r10),%%zmm13		\n\t"\
		"movq	%[__o1],%%rax				\n\t	movq	%[__u1],%%r11			\n\t"\
		"movq	%[__o4],%%rdx				\n\t	movq	%[__u4],%%r14			\n\t"\
		"vsubpd	%%zmm3,%%zmm6,%%zmm6		\n\t	vsubpd	%%zmm11,%%zmm14,%%zmm14		\n\t"\
		"vsubpd	%%zmm2,%%zmm7,%%zmm7		\n\t	vsubpd	%%zmm10,%%zmm15,%%zmm15		\n\t"\
		"vmovaps	%%zmm6,    (%%rax)		\n\t	vmovaps	%%zmm14,    (%%r11)		\n\t"/* o1/o4, u1/u4 */\
		"vmovaps	%%zmm7,0x40(%%rdx)		\n\t	vmovaps	%%zmm15,0x40(%%r14)		\n\t"\
	" vfmadd132pd	%%zmm31,%%zmm6,%%zmm3 	\n\t  vfmadd132pd	%%zmm31,%%zmm14,%%zmm11	\n\t"\
	" vfmadd132pd	%%zmm31,%%zmm7,%%zmm2 	\n\t  vfmadd132pd	%%zmm31,%%zmm15,%%zmm10	\n\t"\
		"vmovaps	%%zmm3,    (%%rdx)		\n\t	vmovaps	%%zmm11,    (%%r14)		\n\t"\
		"vmovaps	%%zmm2,0x40(%%rax)		\n\t	vmovaps	%%zmm10,0x40(%%r11)		\n\t"\
		"movq	%[__o2],%%rbx				\n\t	movq	%[__u2],%%r12			\n\t"\
		"movq	%[__o3],%%rcx				\n\t	movq	%[__u3],%%r13			\n\t"\
		"vsubpd	%%zmm1,%%zmm4,%%zmm4		\n\t	vsubpd	%%zmm9 ,%%zmm12,%%zmm12		\n\t"\
		"vsubpd	%%zmm0,%%zmm5,%%zmm5		\n\t	vsubpd	%%zmm8 ,%%zmm13,%%zmm13		\n\t"\
		"vmovaps	%%zmm4,    (%%rbx)		\n\t	vmovaps	%%zmm12,    (%%r12)		\n\t"/* o2/o3, u2/u3 */\
		"vmovaps	%%zmm5,0x40(%%rcx)		\n\t	vmovaps	%%zmm13,0x40(%%r13)		\n\t"\
	" vfmadd132pd	%%zmm31,%%zmm4,%%zmm1 	\n\t  vfmadd132pd	%%zmm31,%%zmm12,%%zmm9	\n\t"\
	" vfmadd132pd	%%zmm31,%%zmm5,%%zmm0 	\n\t  vfmadd132pd	%%zmm31,%%zmm13,%%zmm8	\n\t"\
		"vmovaps	%%zmm1,    (%%rcx)		\n\t	vmovaps	%%zmm9 ,    (%%r13)		\n\t"\
		"vmovaps	%%zmm0,0x40(%%rbx)		\n\t	vmovaps	%%zmm8 ,0x40(%%r12)		\n\t"\
		:					/* outputs: none */\
		: [__cc1] "m" (Xcc1)	/* All inputs from memory addresses here */\
		 ,[__two] "m" (Xtwo)\
		 ,[__i0] "m" (Xi0)\
		 ,[__i1] "m" (Xi1)\
		 ,[__i2] "m" (Xi2)\
		 ,[__i3] "m" (Xi3)\
		 ,[__i4] "m" (Xi4)\
		 ,[__o0] "m" (Xo0)\
		 ,[__o1] "m" (Xo1)\
		 ,[__o2] "m" (Xo2)\
		 ,[__o3] "m" (Xo3)\
		 ,[__o4] "m" (Xo4)\
		 ,[__j0] "m" (Xj0)\
		 ,[__j1] "m" (Xj1)\
		 ,[__j2] "m" (Xj2)\
		 ,[__j3] "m" (Xj3)\
		 ,[__j4] "m" (Xj4)\
		 ,[__u0] "m" (Xu0)\
		 ,[__u1] "m" (Xu1)\
		 ,[__u2] "m" (Xu2)\
		 ,[__u3] "m" (Xu3)\
		 ,[__u4] "m" (Xu4)\
		: "cc","memory","rax","rbx","rcx","rdx","rdi","rsi","r10","r11","r12","r13","r14","r15","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15","xmm31"		/* Clobbered registers */\
	);\
	}

	// AVX -> FMA version: replace [88 ADD, 16 MUL, 54 memref] ==> [18 ADD, 6 MUL, 42 FMA, 58 memref].
	// I.e. trade [70 ADD, 10 MUL] for 42 FMA. FMA version also better at preserving floating-point accuracy.
	//
	#define SSE2_RADIX_07_DFT(Xi0,Xi1,Xi2,Xi3,Xi4,Xi5,Xi6, Xcc,Xtwo, Xo0,Xo1,Xo2,Xo3,Xo4,Xo5,Xo6)\
	{\
	__asm__ volatile (\
		"movq	%[__two],%%r8		\n\t"\
		"movq	%[__i1],%%rax		\n\t"\
		"movq	%[__i2],%%rbx		\n\t"\
		"movq	%[__i3],%%rcx		\n\t"\
		"movq	%[__i4],%%rdx		\n\t"\
		"movq	%[__i5],%%rsi		\n\t"\
		"movq	%[__i6],%%rdi		\n\t"		/*** Rcol does Imaginary Parts: ***/\
		"vmovaps	(%%rax),%%zmm6		\n\t	vmovaps	0x40(%%rax),%%zmm14	\n\t"	/* x1 */\
		"vmovaps	(%%rdi),%%zmm1		\n\t	vmovaps	0x40(%%rdi),%%zmm9 	\n\t"	/* x6 */\
		"vmovaps	(%%rbx),%%zmm5		\n\t	vmovaps	0x40(%%rbx),%%zmm13	\n\t"	/* x2 */\
		"vmovaps	(%%rsi),%%zmm2		\n\t	vmovaps	0x40(%%rsi),%%zmm10	\n\t"	/* x5 */\
		"vmovaps	(%%rcx),%%zmm4		\n\t	vmovaps	0x40(%%rcx),%%zmm12	\n\t"	/* x3 */\
		"vmovaps	(%%rdx),%%zmm3		\n\t	vmovaps	0x40(%%rdx),%%zmm11	\n\t"	/* x4 */\
		"vmovaps	(%%r8),%%zmm0		\n\t"/* two */\
		"movq	%[__i0],%%rdi		\n\t"\
		"vsubpd	%%zmm1,%%zmm6,%%zmm6	\n\t	vsubpd	%%zmm9 ,%%zmm14,%%zmm14		\n\t"	/* t6 = x1 - x6 */\
		"vsubpd	%%zmm2,%%zmm5,%%zmm5	\n\t	vsubpd	%%zmm10,%%zmm13,%%zmm13		\n\t"	/* t5 = x2 - x5 */\
		"vsubpd	%%zmm3,%%zmm4,%%zmm4	\n\t	vsubpd	%%zmm11,%%zmm12,%%zmm12		\n\t"	/* t4 = x3 - x4 */\
	"vfmadd132pd %%zmm0,%%zmm6,%%zmm1	\n\t vfmadd132pd %%zmm0,%%zmm14,%%zmm9 		\n\t"	/* t1 = x1 + x6 */\
	"vfmadd132pd %%zmm0,%%zmm5,%%zmm2	\n\t vfmadd132pd %%zmm0,%%zmm13,%%zmm10		\n\t"	/* t2 = x2 + x5 */\
	"vfmadd132pd %%zmm0,%%zmm4,%%zmm3	\n\t vfmadd132pd %%zmm0,%%zmm12,%%zmm11		\n\t"	/* t3 = x3 + x4 */\
		"vmovaps	(%%rdi),%%zmm0		\n\t	vmovaps	0x40(%%rdi),%%zmm8 	\n\t"	/* t0 = x0 */\
		"movq	%[__o1],%%rax		\n\t"\
		"movq	%[__o2],%%rbx		\n\t"\
		"movq	%[__o3],%%rcx		\n\t"\
	/* Spill  xi - xj combos to o-slots; these won't be needed until we get to the sine terms: */\
		"vmovaps	%%zmm6,    (%%rax)	\n\t	vmovaps	%%zmm14,0x40(%%rax)	\n\t"/* t6 */\
		"vmovaps	%%zmm5,    (%%rbx)	\n\t	vmovaps	%%zmm13,0x40(%%rbx)	\n\t"/* t5 */\
		"vmovaps	%%zmm4,    (%%rcx)	\n\t	vmovaps	%%zmm12,0x40(%%rcx)	\n\t"/* t4 */\
		"vmovaps	%%zmm0,%%zmm6		\n\t	vmovaps	%%zmm8 ,%%zmm14		\n\t"/* Br0 = t0 (only show real parts in comments) */\
		"vmovaps	%%zmm0,%%zmm5		\n\t	vmovaps	%%zmm8 ,%%zmm13		\n\t"/* rt  = t0 */\
		"vmovaps	%%zmm0,%%zmm4		\n\t	vmovaps	%%zmm8 ,%%zmm12		\n\t"/* re  = t0 */\
\
		"movq	%[__cc],%%rsi		\n\t"\
		"movq	%[__o0],%%rdi		\n\t"\
		"vmovaps	0x80(%%rsi),%%zmm7	\n\t	vmovaps	0x100(%%rsi),%%zmm15			\n\t"/* cc2,cc3 */\
	"vfmadd231pd (%%rsi),%%zmm1,%%zmm5	\n\t vfmadd231pd (%%rsi),%%zmm9 ,%%zmm13	\n\t"/* rt  = FMADD(cc1,tr1, rt ); */\
	"vfmadd231pd %%zmm7 ,%%zmm1,%%zmm4	\n\t vfmadd231pd %%zmm7 ,%%zmm9 ,%%zmm12	\n\t"/* re  = FMADD(cc2,tr1, re ); */\
	"vfmadd231pd %%zmm15,%%zmm1,%%zmm0	\n\t vfmadd231pd %%zmm15,%%zmm9 ,%%zmm8 	\n\t"/* tr0 = FMADD(cc3,tr1, tr0); */\
		"vaddpd	%%zmm1,%%zmm6,%%zmm6	\n\t	vaddpd	%%zmm9 ,%%zmm14,%%zmm14		\n\t"/* Br0 += tr1; */\
\
	"vfmadd231pd %%zmm7 ,%%zmm2,%%zmm5	\n\t vfmadd231pd %%zmm7 ,%%zmm10,%%zmm13	\n\t"/* rt  = FMADD(cc2,tr2, rt ); */\
	"vfmadd231pd %%zmm15,%%zmm2,%%zmm4	\n\t vfmadd231pd %%zmm15,%%zmm10,%%zmm12	\n\t"/* re  = FMADD(cc3,tr2, re ); */\
	"vfmadd231pd (%%rsi),%%zmm2,%%zmm0	\n\t vfmadd231pd (%%rsi),%%zmm10,%%zmm8 	\n\t"/* tr0 = FMADD(cc1,tr2, tr0); */\
		"vaddpd	%%zmm2,%%zmm6,%%zmm6	\n\t	vaddpd	%%zmm10,%%zmm14,%%zmm14		\n\t"/* Br0 += tr2; */\
\
	"vfmadd231pd %%zmm15,%%zmm3,%%zmm5	\n\t vfmadd231pd %%zmm15,%%zmm11,%%zmm13	\n\t"/* rt  = FMADD(cc3,tr3, rt ); */\
	"vfmadd231pd (%%rsi),%%zmm3,%%zmm4	\n\t vfmadd231pd (%%rsi),%%zmm11,%%zmm12	\n\t"/* re  = FMADD(cc1,tr3, re ); */\
	"vfmadd231pd %%zmm7 ,%%zmm3,%%zmm0	\n\t vfmadd231pd %%zmm7 ,%%zmm11,%%zmm8 	\n\t"/* tr0 = FMADD(cc2,tr3, tr0); */\
		"vaddpd	%%zmm3,%%zmm6,%%zmm6	\n\t	vaddpd	%%zmm11,%%zmm14,%%zmm14		\n\t"/* Br0 += tr3; */\
		"vmovaps	%%zmm6,    (%%rdi)	\n\t	vmovaps	%%zmm14,0x40(%%rdi)	\n\t"/* B0 */\
\
		"addq	$0x40,%%rsi		\n\t"/* Incr trig ptr: cc0 -> ss0 */\
		"vmovaps	0x80(%%rsi),%%zmm7	\n\t	vmovaps	0x100(%%rsi),%%zmm15		\n\t"/* ss2,ss3 */\
		"vmovaps		(%%rax),%%zmm1	\n\t	vmovaps	0x40(%%rax),%%zmm9 		\n\t"/* Restore: tr1 = tr6 */\
		"vmovaps		%%zmm1 ,%%zmm2	\n\t	vmovaps		%%zmm9 ,%%zmm10		\n\t"/* tr2 = tr6 */\
		"vmovaps		%%zmm1 ,%%zmm3	\n\t	vmovaps		%%zmm9 ,%%zmm11		\n\t"/* tr3 = tr6 */\
		"vmulpd (%%rsi),%%zmm1,%%zmm1	\n\t	vmulpd (%%rsi),%%zmm9 ,%%zmm9 	\n\t"/* tr1 = ss1*tr6; */\
		"vmulpd %%zmm7 ,%%zmm2,%%zmm2	\n\t	vmulpd %%zmm7 ,%%zmm10,%%zmm10	\n\t"/* tr2 = ss2*tr6; */\
		"vmulpd %%zmm15,%%zmm3,%%zmm3	\n\t	vmulpd %%zmm15,%%zmm11,%%zmm11	\n\t"/* tr3 = ss3*tr6; */\
\
		"vmovaps		(%%rbx),%%zmm6	\n\t	vmovaps	0x40(%%rbx),%%zmm14			\n\t"/* Restore t5 */\
	" vfmadd231pd %%zmm7 ,%%zmm6,%%zmm1	\n\t  vfmadd231pd %%zmm7 ,%%zmm14,%%zmm9 	\n\t"/* tr1 =  FMADD(ss2,tr5, tr1); */\
	"vfnmadd231pd %%zmm15,%%zmm6,%%zmm2	\n\t vfnmadd231pd %%zmm15,%%zmm14,%%zmm10	\n\t"/* tr2 = FNMADD(ss3,tr5, tr2); */\
	"vfnmadd231pd (%%rsi),%%zmm6,%%zmm3	\n\t vfnmadd231pd (%%rsi),%%zmm14,%%zmm11	\n\t"/* tr3 = FNMADD(ss1,tr5, tr3); */\
\
		"vmovaps		(%%rcx),%%zmm6	\n\t	vmovaps	0x40(%%rcx),%%zmm14			\n\t"/* Restore t4 */\
	" vfmadd231pd %%zmm15,%%zmm6,%%zmm1	\n\t  vfmadd231pd %%zmm15,%%zmm14,%%zmm9 	\n\t"/* tr1 =  FMADD(ss3,tr4, tr1); */\
	"vfnmadd231pd (%%rsi),%%zmm6,%%zmm2	\n\t vfnmadd231pd (%%rsi),%%zmm14,%%zmm10	\n\t"/* tr2 = FNMADD(ss1,tr4, tr2); */\
	" vfmadd231pd %%zmm7 ,%%zmm6,%%zmm3	\n\t  vfmadd231pd %%zmm7 ,%%zmm14,%%zmm11	\n\t"/* tr3 =  FMADD(ss2,tr4, tr3); */\
\
		"\n\t"\
		"movq	%[__o4],%%rdx		\n\t"\
		"movq	%[__o5],%%rsi		\n\t"\
		"movq	%[__o6],%%rdi		\n\t"\
		"vmovaps	(%%r8),%%zmm6	\n\t"/* two */\
	/* Output permutation causes signs to get flipped here: */\
		"vsubpd	%%zmm9 ,%%zmm5,%%zmm5		\n\t	vsubpd	%%zmm1 ,%%zmm13,%%zmm13	\n\t"/* Br1 = rt  - ti1;	Bi6 = it  - tr1; */\
		"vsubpd	%%zmm10,%%zmm4,%%zmm4		\n\t	vsubpd	%%zmm2 ,%%zmm12,%%zmm12	\n\t"/* Br2 = re  - ti2;	Bi5 = im  - tr2; */\
		"vsubpd	%%zmm11,%%zmm0,%%zmm0		\n\t	vsubpd	%%zmm3 ,%%zmm8 ,%%zmm8 	\n\t"/* Br3 = tr0 - ti3;	Bi4 = ti0 - tr3; */\
	"vfmadd132pd %%zmm6,%%zmm5,%%zmm9 		\n\t vfmadd132pd %%zmm6,%%zmm13,%%zmm1 	\n\t"/* Br6 = rt  + ti1;	Bi1 = it  + tr1; */\
	"vfmadd132pd %%zmm6,%%zmm4,%%zmm10		\n\t vfmadd132pd %%zmm6,%%zmm12,%%zmm2 	\n\t"/* Br5 = re  + ti2;	Bi2 = im  + tr2; */\
	"vfmadd132pd %%zmm6,%%zmm0,%%zmm11		\n\t vfmadd132pd %%zmm6,%%zmm8 ,%%zmm3 	\n\t"/* Br4 = tr0 + ti3;	Bi3 = ti0 + tr3; */\
		"vmovaps	%%zmm5	,   (%%rax)		\n\t	vmovaps	%%zmm13,0x40(%%rdi)	\n\t"/* Br1,Bi6 */\
		"vmovaps	%%zmm4	,   (%%rbx)		\n\t	vmovaps	%%zmm12,0x40(%%rsi)	\n\t"/* Br2,Bi5 */\
		"vmovaps	%%zmm0	,   (%%rcx)		\n\t	vmovaps	%%zmm8 ,0x40(%%rdx)	\n\t"/* Br3,Bi4 */\
		"vmovaps	%%zmm9 	,   (%%rdi)		\n\t	vmovaps	%%zmm1 ,0x40(%%rax)	\n\t"/* Br6,Bi1 */\
		"vmovaps	%%zmm10	,   (%%rsi)		\n\t	vmovaps	%%zmm2 ,0x40(%%rbx)	\n\t"/* Br5,Bi2 */\
		"vmovaps	%%zmm11	,   (%%rdx)		\n\t	vmovaps	%%zmm3 ,0x40(%%rcx)	\n\t"/* Br4,Bi3 */\
		:					/* outputs: none */\
		: [__i0] "m" (Xi0)	/* All inputs from memory addresses here */\
		 ,[__i1] "m" (Xi1)\
		 ,[__i2] "m" (Xi2)\
		 ,[__i3] "m" (Xi3)\
		 ,[__i4] "m" (Xi4)\
		 ,[__i5] "m" (Xi5)\
		 ,[__i6] "m" (Xi6)\
		 ,[__cc] "m" (Xcc)\
		 ,[__two] "m" (Xtwo)\
		 ,[__o0] "m" (Xo0)\
		 ,[__o1] "m" (Xo1)\
		 ,[__o2] "m" (Xo2)\
		 ,[__o3] "m" (Xo3)\
		 ,[__o4] "m" (Xo4)\
		 ,[__o5] "m" (Xo5)\
		 ,[__o6] "m" (Xo6)\
		: "cc","memory","rax","rbx","rcx","rdx","rsi","rdi","r8","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15"		/* Clobbered registers */\
	);\
	}

	/* Twiddleless version of SSE2_RADIX8_DIF_TWIDDLE. Inputs enter in memory locations __r0 + [__i1,__i2,__i3,__i4,__i5,__i6,__i7],
	where r0 is a memory address and the i's are LITERAL [BYTE] OFFSETS. Outputs go into memory locations __o0,__o1,__o2,__o3,__o4,__o5,__o6,__o7, assumed disjoint with inputs:\
	*/
	#define SSE2_RADIX8_DIF_0TWIDDLE(Xr0, Xi1,Xi2,Xi3,Xi4,Xi5,Xi6,Xi7, Xo0,Xo1,Xo2,Xo3,Xo4,Xo5,Xo6,Xo7, Xisrt2,Xtwo)\
	{\
	__asm__ volatile (\
	/* 1st of 2 radix-4 subtransforms, data in zmm0-7: */	/* 2nd of 2 radix-4 subtransforms, data in zmm8-15: */\
		"movq	%[__isrt2],%%rsi				\n\t		movq	%[__two],%%r9	\n\t"/* r9 holds 2.0 throughout */\
		"movq	%[__r0],%%rax	/* i0 = r00 */	\n\t		leaq	%c[__i1](%%rax),%%r10	/* i1 */\n\t"\
		"leaq	%c[__i2](%%rax),%%rbx			\n\t		leaq	%c[__i3](%%rax),%%r11	/* i3 */\n\t"\
		"leaq	%c[__i4](%%rax),%%rcx			\n\t		leaq	%c[__i5](%%rax),%%r12	/* i5 */\n\t"\
		"leaq	%c[__i6](%%rax),%%rdx			\n\t		leaq	%c[__i7](%%rax),%%r13	/* i7 */\n\t"\
	/* p0,4 combo: x+-y into zmm0/1, 2/3, resp: */		/* p1,5 combo: x+y into zmm8 /1, x-y in zmm10/3: */\
	/* p2,6 combo: x+-y into zmm4/5, 6/7, resp: */		/* p3,7 combo: x+y into zmm14/7, x-y in zmm12/5: */\
		"vmovaps	     (%%rcx),%%zmm0			\n\t		vmovaps	     (%%r12),%%zmm8 			\n\t"\
		"vmovaps	0x040(%%rcx),%%zmm1			\n\t		vmovaps	0x040(%%r12),%%zmm9 			\n\t"\
		"vmovaps	     (%%rax),%%zmm2			\n\t		vmovaps	     (%%r10),%%zmm10			\n\t"\
		"vmovaps	0x040(%%rax),%%zmm3			\n\t		vmovaps	0x040(%%r10),%%zmm11			\n\t"\
		"vmovaps	     (%%rdx),%%zmm4			\n\t		vmovaps	     (%%r11),%%zmm12			\n\t"\
		"vmovaps	0x040(%%rdx),%%zmm5			\n\t		vmovaps	0x040(%%r11),%%zmm13			\n\t"\
		"vmovaps	     (%%rbx),%%zmm6			\n\t		vmovaps	     (%%r13),%%zmm14			\n\t"\
		"vmovaps	0x040(%%rbx),%%zmm7			\n\t		vmovaps	0x040(%%r13),%%zmm15			\n\t"\
		"vsubpd	%%zmm0,%%zmm2,%%zmm2			\n\t		vsubpd	%%zmm8 ,%%zmm10,%%zmm10			\n\t"\
		"vsubpd	%%zmm1,%%zmm3,%%zmm3			\n\t		vsubpd	%%zmm9 ,%%zmm11,%%zmm11			\n\t"\
		"vsubpd	%%zmm4,%%zmm6,%%zmm6			\n\t		vsubpd	%%zmm14,%%zmm12,%%zmm12			\n\t"\
		"vsubpd	%%zmm5,%%zmm7,%%zmm7			\n\t		vsubpd	%%zmm15,%%zmm13,%%zmm13			\n\t"\
	"vmovaps	%%zmm15,(%%rax) 	\n\t"/* spill zmm15 to make room for 2.0 */"	vmovaps	 (%%r9),%%zmm15	\n\t"/* two */\
	"vfmadd132pd	%%zmm15,%%zmm2,%%zmm0		\n\t	vfmadd132pd	%%zmm15,%%zmm10,%%zmm8 	\n\t"\
	"vfmadd132pd	%%zmm15,%%zmm3,%%zmm1		\n\t	vfmadd132pd	%%zmm15,%%zmm11,%%zmm9 	\n\t"\
	"vfmadd132pd	%%zmm15,%%zmm6,%%zmm4		\n\t	vfmadd132pd	%%zmm15,%%zmm12,%%zmm14	\n\t"\
	"vfmadd132pd	%%zmm15,%%zmm7,%%zmm5		\n\t	vfmadd132pd	(%%rax),%%zmm13,%%zmm15	\n\t"\
		/* Finish radix-4 butterfly and store results into temporary-array slots: */\
		"vsubpd		%%zmm4,%%zmm0,%%zmm0		\n\t		vsubpd		%%zmm14,%%zmm8 ,%%zmm8 		\n\t"\
		"vsubpd		%%zmm5,%%zmm1,%%zmm1		\n\t		vsubpd		%%zmm15,%%zmm9 ,%%zmm9 		\n\t"\
		"vsubpd		%%zmm7,%%zmm2,%%zmm2		\n\t		vsubpd		%%zmm13,%%zmm10,%%zmm10		\n\t"\
		"vsubpd		%%zmm6,%%zmm3,%%zmm3		\n\t		vsubpd		%%zmm12,%%zmm11,%%zmm11		\n\t"\
	"vmovaps	%%zmm12,(%%rax) 	\n\t"/* spill zmm12 to make room for 2.0 */"	vmovaps	 (%%r9),%%zmm12	\n\t"/* two */\
	"vfmadd132pd	%%zmm12,%%zmm0,%%zmm4		\n\t	vfmadd132pd		%%zmm12,%%zmm8 ,%%zmm14		\n\t"\
	"vfmadd132pd	%%zmm12,%%zmm1,%%zmm5		\n\t	vfmadd132pd		%%zmm12,%%zmm9 ,%%zmm15		\n\t"\
	"vfmadd132pd	%%zmm12,%%zmm2,%%zmm7		\n\t	vfmadd132pd		%%zmm12,%%zmm10,%%zmm13		\n\t"\
	"vfmadd132pd	%%zmm12,%%zmm3,%%zmm6		\n\t	vfmadd132pd		(%%rax),%%zmm11,%%zmm12		\n\t"\
		"													vsubpd		%%zmm12,%%zmm10,%%zmm10		\n\t"\
		"													vsubpd		%%zmm11,%%zmm13,%%zmm13		\n\t"\
		"												vfmadd132pd		(%%r9 ),%%zmm10,%%zmm12		\n\t"/* .two */\
		"												vfmadd132pd		(%%r9 ),%%zmm13,%%zmm11		\n\t"\
		/* SSE2_RADIX8_DIF_COMBINE_RAD4_SUBS(r00,r10,r20,r30,r08,r18,r28,r38): */\
		"\n\t"\
		"movq	%[__o0],%%rax					\n\t		movq	%[__o4],%%r10				\n\t"\
		"movq	%[__o1],%%rbx					\n\t		movq	%[__o5],%%r11				\n\t"\
		"movq	%[__o2],%%rcx					\n\t		movq	%[__o6],%%r12				\n\t"\
		"movq	%[__o3],%%rdx					\n\t		movq	%[__o7],%%r13				\n\t"\
		/* Combine r00,r08,r20,r28: */						/* Combine r10,r18,r30,r38: */\
		"vsubpd		%%zmm14,%%zmm4 ,%%zmm4 		\n\t	vfnmadd231pd	(%%rsi),%%zmm10,%%zmm2 		\n\t"/* .isrt2 */\
		"vsubpd		%%zmm9 ,%%zmm0 ,%%zmm0 		\n\t	vfnmadd231pd	(%%rsi),%%zmm13,%%zmm3 		\n\t"\
		"vsubpd		%%zmm15,%%zmm5 ,%%zmm5 		\n\t	vfnmadd231pd	(%%rsi),%%zmm12,%%zmm6 		\n\t"\
		"vsubpd		%%zmm8 ,%%zmm1 ,%%zmm1 		\n\t	vfnmadd231pd	(%%rsi),%%zmm11,%%zmm7 		\n\t"\
	"vmovaps	%%zmm8 ,(%%rax) 	\n\t"/* spill zmm8  to make room for 2.0 */"	vmovaps	 (%%r9),%%zmm8 	\n\t"/* two */\
	"vmovaps	%%zmm11,(%%r10) 	\n\t"/* spill zmm11 to make room for sqrt2 */"	vmovaps	0x80(%%r9),%%zmm11 \n\t"/* sqrt2 */\
		"vmovaps	%%zmm4 ,    (%%rbx)			\n\t		vmovaps	%%zmm2 ,    (%%r11)			\n\t"\
		"vmovaps	%%zmm0 ,    (%%rcx)			\n\t		vmovaps	%%zmm3 ,0x40(%%r13)			\n\t"\
		"vmovaps	%%zmm5 ,0x40(%%rbx)			\n\t		vmovaps	%%zmm6 ,0x40(%%r11)			\n\t"\
		"vmovaps	%%zmm1 ,0x40(%%rdx)			\n\t		vmovaps	%%zmm7 ,    (%%r12)			\n\t"\
	"vfmadd132pd	%%zmm8 ,%%zmm4 ,%%zmm14		\n\t	vfmadd132pd		%%zmm11,%%zmm2 ,%%zmm10		\n\t"\
	"vfmadd132pd	%%zmm8 ,%%zmm0 ,%%zmm9 		\n\t	vfmadd132pd		%%zmm11,%%zmm3 ,%%zmm13		\n\t"\
	"vfmadd132pd	%%zmm8 ,%%zmm5 ,%%zmm15		\n\t	vfmadd132pd		%%zmm11,%%zmm6 ,%%zmm12		\n\t"\
	"vfmadd132pd	(%%rax),%%zmm1 ,%%zmm8 		\n\t	vfmadd132pd		(%%r10),%%zmm7 ,%%zmm11		\n\t"\
		"vmovaps	%%zmm14,    (%%rax)			\n\t		vmovaps	%%zmm10,    (%%r10)			\n\t"\
		"vmovaps	%%zmm9 ,    (%%rdx)			\n\t		vmovaps	%%zmm13,0x40(%%r12)			\n\t"\
		"vmovaps	%%zmm15,0x40(%%rax)			\n\t		vmovaps	%%zmm12,0x40(%%r10)			\n\t"\
		"vmovaps	%%zmm8 ,0x40(%%rcx)			\n\t		vmovaps	%%zmm11,    (%%r13)			\n\t"\
		:					/* outputs: none */\
		: [__r0] "m" (Xr0)	/* All inputs from memory addresses here */\
		 ,[__i1] "e" (Xi1)\
		 ,[__i2] "e" (Xi2)\
		 ,[__i3] "e" (Xi3)\
		 ,[__i4] "e" (Xi4)\
		 ,[__i5] "e" (Xi5)\
		 ,[__i6] "e" (Xi6)\
		 ,[__i7] "e" (Xi7)\
		 ,[__o0] "m" (Xo0)\
		 ,[__o1] "m" (Xo1)\
		 ,[__o2] "m" (Xo2)\
		 ,[__o3] "m" (Xo3)\
		 ,[__o4] "m" (Xo4)\
		 ,[__o5] "m" (Xo5)\
		 ,[__o6] "m" (Xo6)\
		 ,[__o7] "m" (Xo7)\
		 ,[__isrt2] "m" (Xisrt2)\
		 ,[__two] "m" (Xtwo)\
		: "cc","memory","rax","rbx","rcx","rdx","rsi","r9","r10","r11","r12","r13","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15"		/* Clobbered registers */\
	);\
	}

	// Need a 2nd version of above which takes the i-strides as intvars rather than literal bytes:
	#define SSE2_RADIX8_DIF_0TWIDDLE_B(Xr0, Xi1,Xi2,Xi3,Xi4,Xi5,Xi6,Xi7, Xo0,Xo1,Xo2,Xo3,Xo4,Xo5,Xo6,Xo7, Xisrt2,Xtwo)\
	{\
	__asm__ volatile (\
	/* 1st of 2 radix-4 subtransforms, data in ymm0-7: */	/* 2nd of 2 radix-4 subtransforms, data in ymm8-15: */\
		"movq	%[__isrt2],%%rsi				\n\t		movq	%[__two],%%r9	\n\t"/* r9 holds 2.0 throughout */\
		"movq	%[__r0],%%rax	/* i0 = r00 */	\n\t		movslq	%[__i1],%%r10		/* i1 */	\n\t"\
		"movslq	%[__i2],%%rbx	/* i2 */		\n\t		movslq	%[__i3],%%r11		/* i3 */	\n\t"\
		"movslq	%[__i4],%%rcx	/* i4 */		\n\t		movslq	%[__i5],%%r12		/* i5 */	\n\t"\
		"movslq	%[__i6],%%rdx	/* i6 */		\n\t		movslq	%[__i7],%%r13		/* i7 */	\n\t"\
		"addq	%%rax,%%rbx						\n\t		addq	%%rax,%%r10						\n\t"\
		"addq	%%rax,%%rcx						\n\t		addq	%%rax,%%r11						\n\t"\
		"addq	%%rax,%%rdx						\n\t		addq	%%rax,%%r12						\n\t"\
		"													addq	%%rax,%%r13						\n\t"\
	/* p0,4 combo: x+-y into zmm0/1, 2/3, resp: */		/* p1,5 combo: x+y into zmm8 /1, x-y in zmm10/3: */\
	/* p2,6 combo: x+-y into zmm4/5, 6/7, resp: */		/* p3,7 combo: x+y into zmm14/7, x-y in zmm12/5: */\
		"vmovaps	     (%%rcx),%%zmm0			\n\t		vmovaps	     (%%r12),%%zmm8 			\n\t"\
		"vmovaps	0x040(%%rcx),%%zmm1			\n\t		vmovaps	0x040(%%r12),%%zmm9 			\n\t"\
		"vmovaps	     (%%rax),%%zmm2			\n\t		vmovaps	     (%%r10),%%zmm10			\n\t"\
		"vmovaps	0x040(%%rax),%%zmm3			\n\t		vmovaps	0x040(%%r10),%%zmm11			\n\t"\
		"vmovaps	     (%%rdx),%%zmm4			\n\t		vmovaps	     (%%r11),%%zmm12			\n\t"\
		"vmovaps	0x040(%%rdx),%%zmm5			\n\t		vmovaps	0x040(%%r11),%%zmm13			\n\t"\
		"vmovaps	     (%%rbx),%%zmm6			\n\t		vmovaps	     (%%r13),%%zmm14			\n\t"\
		"vmovaps	0x040(%%rbx),%%zmm7			\n\t		vmovaps	0x040(%%r13),%%zmm15			\n\t"\
		"vsubpd	%%zmm0,%%zmm2,%%zmm2			\n\t		vsubpd	%%zmm8 ,%%zmm10,%%zmm10			\n\t"\
		"vsubpd	%%zmm1,%%zmm3,%%zmm3			\n\t		vsubpd	%%zmm9 ,%%zmm11,%%zmm11			\n\t"\
		"vsubpd	%%zmm4,%%zmm6,%%zmm6			\n\t		vsubpd	%%zmm14,%%zmm12,%%zmm12			\n\t"\
		"vsubpd	%%zmm5,%%zmm7,%%zmm7			\n\t		vsubpd	%%zmm15,%%zmm13,%%zmm13			\n\t"\
	"vmovaps	%%zmm15,(%%rax) 	\n\t"/* spill zmm15 to make room for 2.0 */"	vmovaps	 (%%r9),%%zmm15	\n\t"/* two */\
	"vfmadd132pd	%%zmm15,%%zmm2,%%zmm0		\n\t	vfmadd132pd	%%zmm15,%%zmm10,%%zmm8 	\n\t"\
	"vfmadd132pd	%%zmm15,%%zmm3,%%zmm1		\n\t	vfmadd132pd	%%zmm15,%%zmm11,%%zmm9 	\n\t"\
	"vfmadd132pd	%%zmm15,%%zmm6,%%zmm4		\n\t	vfmadd132pd	%%zmm15,%%zmm12,%%zmm14	\n\t"\
	"vfmadd132pd	%%zmm15,%%zmm7,%%zmm5		\n\t	vfmadd132pd	(%%rax),%%zmm13,%%zmm15	\n\t"\
		/* Finish radix-4 butterfly and store results into temporary-array slots: */\
		"vsubpd		%%zmm4,%%zmm0,%%zmm0		\n\t		vsubpd		%%zmm14,%%zmm8 ,%%zmm8 		\n\t"\
		"vsubpd		%%zmm5,%%zmm1,%%zmm1		\n\t		vsubpd		%%zmm15,%%zmm9 ,%%zmm9 		\n\t"\
		"vsubpd		%%zmm7,%%zmm2,%%zmm2		\n\t		vsubpd		%%zmm13,%%zmm10,%%zmm10		\n\t"\
		"vsubpd		%%zmm6,%%zmm3,%%zmm3		\n\t		vsubpd		%%zmm12,%%zmm11,%%zmm11		\n\t"\
	"vmovaps	%%zmm12,(%%rax) 	\n\t"/* spill zmm12 to make room for 2.0 */"	vmovaps	 (%%r9),%%zmm12	\n\t"/* two */\
	"vfmadd132pd	%%zmm12,%%zmm0,%%zmm4		\n\t	vfmadd132pd		%%zmm12,%%zmm8 ,%%zmm14		\n\t"\
	"vfmadd132pd	%%zmm12,%%zmm1,%%zmm5		\n\t	vfmadd132pd		%%zmm12,%%zmm9 ,%%zmm15		\n\t"\
	"vfmadd132pd	%%zmm12,%%zmm2,%%zmm7		\n\t	vfmadd132pd		%%zmm12,%%zmm10,%%zmm13		\n\t"\
	"vfmadd132pd	%%zmm12,%%zmm3,%%zmm6		\n\t	vfmadd132pd		(%%rax),%%zmm11,%%zmm12		\n\t"\
		"													vsubpd		%%zmm12,%%zmm10,%%zmm10		\n\t"\
		"													vsubpd		%%zmm11,%%zmm13,%%zmm13		\n\t"\
		"												vfmadd132pd		(%%r9 ),%%zmm10,%%zmm12		\n\t"/* .two */\
		"												vfmadd132pd		(%%r9 ),%%zmm13,%%zmm11		\n\t"\
		/* SSE2_RADIX8_DIF_COMBINE_RAD4_SUBS(r00,r10,r20,r30,r08,r18,r28,r38): */\
		"\n\t"\
		"movq	%[__o0],%%rax					\n\t		movq	%[__o4],%%r10				\n\t"\
		"movq	%[__o1],%%rbx					\n\t		movq	%[__o5],%%r11				\n\t"\
		"movq	%[__o2],%%rcx					\n\t		movq	%[__o6],%%r12				\n\t"\
		"movq	%[__o3],%%rdx					\n\t		movq	%[__o7],%%r13				\n\t"\
		/* Combine r00,r08,r20,r28: */						/* Combine r10,r18,r30,r38: */\
		"vsubpd		%%zmm14,%%zmm4 ,%%zmm4 		\n\t	vfnmadd231pd	(%%rsi),%%zmm10,%%zmm2 		\n\t"/* .isrt2 */\
		"vsubpd		%%zmm9 ,%%zmm0 ,%%zmm0 		\n\t	vfnmadd231pd	(%%rsi),%%zmm13,%%zmm3 		\n\t"\
		"vsubpd		%%zmm15,%%zmm5 ,%%zmm5 		\n\t	vfnmadd231pd	(%%rsi),%%zmm12,%%zmm6 		\n\t"\
		"vsubpd		%%zmm8 ,%%zmm1 ,%%zmm1 		\n\t	vfnmadd231pd	(%%rsi),%%zmm11,%%zmm7 		\n\t"\
	"vmovaps	%%zmm8 ,(%%rax) 	\n\t"/* spill zmm8  to make room for 2.0 */"	vmovaps	 (%%r9),%%zmm8 	\n\t"/* two */\
	"vmovaps	%%zmm11,(%%r10) 	\n\t"/* spill zmm11 to make room for sqrt2 */"	vmovaps	0x80(%%r9),%%zmm11 \n\t"/* sqrt2 */\
		"vmovaps	%%zmm4 ,    (%%rbx)			\n\t		vmovaps	%%zmm2 ,    (%%r11)			\n\t"\
		"vmovaps	%%zmm0 ,    (%%rcx)			\n\t		vmovaps	%%zmm3 ,0x40(%%r13)			\n\t"\
		"vmovaps	%%zmm5 ,0x40(%%rbx)			\n\t		vmovaps	%%zmm6 ,0x40(%%r11)			\n\t"\
		"vmovaps	%%zmm1 ,0x40(%%rdx)			\n\t		vmovaps	%%zmm7 ,    (%%r12)			\n\t"\
	"vfmadd132pd	%%zmm8 ,%%zmm4 ,%%zmm14		\n\t	vfmadd132pd		%%zmm11,%%zmm2 ,%%zmm10		\n\t"\
	"vfmadd132pd	%%zmm8 ,%%zmm0 ,%%zmm9 		\n\t	vfmadd132pd		%%zmm11,%%zmm3 ,%%zmm13		\n\t"\
	"vfmadd132pd	%%zmm8 ,%%zmm5 ,%%zmm15		\n\t	vfmadd132pd		%%zmm11,%%zmm6 ,%%zmm12		\n\t"\
	"vfmadd132pd	(%%rax),%%zmm1 ,%%zmm8 		\n\t	vfmadd132pd		(%%r10),%%zmm7 ,%%zmm11		\n\t"\
		"vmovaps	%%zmm14,    (%%rax)			\n\t		vmovaps	%%zmm10,    (%%r10)			\n\t"\
		"vmovaps	%%zmm9 ,    (%%rdx)			\n\t		vmovaps	%%zmm13,0x40(%%r12)			\n\t"\
		"vmovaps	%%zmm15,0x40(%%rax)			\n\t		vmovaps	%%zmm12,0x40(%%r10)			\n\t"\
		"vmovaps	%%zmm8 ,0x40(%%rcx)			\n\t		vmovaps	%%zmm11,    (%%r13)			\n\t"\
		:					/* outputs: none */\
		: [__r0] "m" (Xr0)	/* All inputs from memory addresses here */\
		 ,[__i1] "m" (Xi1)\
		 ,[__i2] "m" (Xi2)\
		 ,[__i3] "m" (Xi3)\
		 ,[__i4] "m" (Xi4)\
		 ,[__i5] "m" (Xi5)\
		 ,[__i6] "m" (Xi6)\
		 ,[__i7] "m" (Xi7)\
		 ,[__o0] "m" (Xo0)\
		 ,[__o1] "m" (Xo1)\
		 ,[__o2] "m" (Xo2)\
		 ,[__o3] "m" (Xo3)\
		 ,[__o4] "m" (Xo4)\
		 ,[__o5] "m" (Xo5)\
		 ,[__o6] "m" (Xo6)\
		 ,[__o7] "m" (Xo7)\
		 ,[__isrt2] "m" (Xisrt2)\
		 ,[__two] "m" (Xtwo)\
		: "cc","memory","rax","rbx","rcx","rdx","rsi","r9","r10","r11","r12","r13","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15"		/* Clobbered registers */\
	);\
	}

	// AVX analog of dft_macro.h::RADIX_08_DIF_TWIDDLE_OOP - Result of adding separate I/O addressing to
	// radix8_dif_dit_pass_gcc64.h::SSE2_RADIX8_DIF_TWIDDLE.
	//*** Dec 2020: For guide to updated reduced-#arg IO address computation, cf. the SSE2 version of this macro. ***
	//
	// [rsi] (and if needed rdi) points to sine components of each sincos pair, which is not really a pair here in terms of relative addressing.
	//
	#define SSE2_RADIX8_DIF_TWIDDLE_OOP(Xin0,Xi1, Xout0,Xoff, Xtwid_ptrs, Xtwo)\
	{\
	__asm__ volatile (\
		"xorq	%%r8,%%r8	\n\t	leaq	%c[i1](%%r8),%%r8	\n\t"/* movq|movslq of literal %c[i1] both segfaulted, workaround via LEA */\
		"movq	%[in0],%%rax		\n\t	leaq	(%%rax,%%r8,4),%%r10	\n\t"/* [lcol,rcol] base-addresses = in0 + [0,4*istride] */\
		"movq	%[twid_ptrs],%%rsi	\n\t"\
		"leaq	(%%rax,%%r8),%%rbx	\n\t	leaq	(%%r10,%%r8  ),%%r11	\n\t"\
		/* The twid_ptrs[] array holds ptrs to 14 complex twiddles in BR order: (c,s)[4,2,6,1,5,3,7]: */\
		"movq	0x30(%%rsi),%%r12	\n\t	movq	0x40(%%rsi),%%r14	\n\t	movq	    (%%rsi),%%rcx	\n\t"/* c1,5,4 */\
		"movq	0x38(%%rsi),%%r13	\n\t	movq	0x48(%%rsi),%%r15	\n\t	movq	0x08(%%rsi),%%rsi	\n\t"/* s1,5,4 ... do c4,s4 last because s4-result overwrites rsi */\
		"vmovaps	    (%%rbx)	,%%zmm2			\n\t		vmovaps	    (%%r10)	,%%zmm8 			\n\t"\
		"vmovaps	0x40(%%rbx)	,%%zmm3			\n\t		vmovaps	0x40(%%r10)	,%%zmm10			\n\t"\
		"vmovaps		%%zmm2	,%%zmm4			\n\t		vmovaps		%%zmm8 	,%%zmm9 			\n\t"\
		"vmovaps		%%zmm3	,%%zmm5			\n\t		vmovaps		%%zmm10	,%%zmm11			\n\t"\
		"vmulpd		(%%rcx)	,%%zmm2,%%zmm2		\n\t		vmulpd		(%%r12)	,%%zmm8 ,%%zmm8 	\n\t"\
		"vmulpd		(%%rcx)	,%%zmm3,%%zmm3		\n\t		vmulpd		(%%r13)	,%%zmm9 ,%%zmm9 	\n\t"\
	"vfnmadd231pd	(%%rsi)	,%%zmm5,%%zmm2		\n\t	vfnmadd231pd	(%%r13)	,%%zmm10,%%zmm8 	\n\t"\
	" vfmadd231pd	(%%rsi)	,%%zmm4,%%zmm3		\n\t	 vfmadd231pd	(%%r12)	,%%zmm11,%%zmm9 	\n\t"\
		"vmovaps	    (%%rax)	,%%zmm0			\n\t		vmovaps	    (%%r11)	,%%zmm10			\n\t"\
		"vmovaps	0x40(%%rax)	,%%zmm1			\n\t		vmovaps	0x40(%%r11)	,%%zmm11			\n\t"\
		"vmovaps		%%zmm0	,%%zmm6			\n\t		vmovaps	    (%%r11)	,%%zmm12			\n\t"\
		"vmovaps		%%zmm1	,%%zmm7			\n\t		vmovaps	0x40(%%r11)	,%%zmm13			\n\t"\
		"vaddpd	%%zmm2		,%%zmm0,%%zmm0		\n\t		vmulpd		(%%r14)	,%%zmm10,%%zmm10	\n\t"\
		"vaddpd	%%zmm3		,%%zmm1,%%zmm1		\n\t		vmulpd		(%%r15)	,%%zmm12,%%zmm12	\n\t"\
		"vsubpd	%%zmm2		,%%zmm6,%%zmm6		\n\t	vfnmadd231pd	(%%r15)	,%%zmm11,%%zmm10	\n\t	vmovaps	%%zmm10,%%zmm11	\n\t"\
		"vsubpd	%%zmm3		,%%zmm7,%%zmm7		\n\t	 vfmadd231pd	(%%r14)	,%%zmm13,%%zmm12	\n\t	vmovaps	%%zmm12,%%zmm13	\n\t"\
		"vmovaps	%%zmm0		,    (%%rax)	\n\t		vaddpd	%%zmm8 		,%%zmm10,%%zmm10	\n\t"\
		"vmovaps	%%zmm1		,0x40(%%rax)	\n\t		vsubpd	%%zmm11		,%%zmm8 ,%%zmm8 	\n\t"\
		"vmovaps	%%zmm6		,    (%%rbx)	\n\t		vaddpd	%%zmm9 		,%%zmm12,%%zmm12	\n\t"\
		"vmovaps	%%zmm7		,0x40(%%rbx)	\n\t		vsubpd	%%zmm13		,%%zmm9 ,%%zmm9 	\n\t"\
		"leaq	(%%rax,%%r8,2),%%rax			\n\t		vmovaps	%%zmm10		,    (%%r10)	\n\t"\
		"leaq	(%%rbx,%%r8,2),%%rbx			\n\t		vmovaps	%%zmm12		,0x40(%%r10)	\n\t"\
		"movq	%[twid_ptrs],%%r15				\n\t"\
		"movq		0x10(%%r15),%%rcx			\n\t		vmovaps	%%zmm8 		,    (%%r11)	\n\t"/* c2 */\
		"movq		0x20(%%r15),%%rdx			\n\t		vmovaps	%%zmm9 		,0x40(%%r11)	\n\t"/* c6 */\
		"movq		0x18(%%r15),%%rsi	\n\t	leaq	(%%r10,%%r8,2),%%r10	\n\t	movq 0x50(%%r15),%%r12 \n\t movq 0x58(%%r15),%%r13	\n\t"/* s2, c3,s3 */\
		"movq		0x28(%%r15),%%rdi	\n\t	leaq	(%%r11,%%r8,2),%%r11	\n\t	movq 0x60(%%r15),%%r14 \n\t movq 0x68(%%r15),%%r15	\n\t"/* s6, c7,s7 */\
		"vmovaps	    (%%rax)	,%%zmm0			\n\t		vmovaps	    (%%r10)	,%%zmm8 			\n\t"\
		"vmovaps	0x40(%%rax)	,%%zmm2			\n\t		vmovaps	0x40(%%r10)	,%%zmm10			\n\t"\
		"vmovaps		%%zmm0	,%%zmm1			\n\t		vmovaps		%%zmm8 	,%%zmm9 			\n\t"\
		"vmovaps		%%zmm2	,%%zmm3			\n\t		vmovaps		%%zmm10	,%%zmm11			\n\t"\
		"vmulpd		(%%rcx)	,%%zmm0,%%zmm0		\n\t		vmulpd		(%%r12)	,%%zmm8 ,%%zmm8 	\n\t"\
		"vmulpd		(%%rsi)	,%%zmm1,%%zmm1		\n\t		vmulpd		(%%r13)	,%%zmm9 ,%%zmm9 	\n\t"\
	"vfnmadd231pd	(%%rsi)	,%%zmm2,%%zmm0		\n\t	vfnmadd231pd	(%%r13)	,%%zmm10,%%zmm8 	\n\t"\
	" vfmadd231pd	(%%rcx)	,%%zmm3,%%zmm1		\n\t	 vfmadd231pd	(%%r12)	,%%zmm11,%%zmm9 	\n\t"\
		"vmovaps	    (%%rbx)	,%%zmm2			\n\t		vmovaps	    (%%r11)	,%%zmm10			\n\t"\
		"vmovaps	0x40(%%rbx)	,%%zmm3			\n\t		vmovaps	0x40(%%r11)	,%%zmm11			\n\t"\
		"vmovaps		%%zmm2	,%%zmm4			\n\t		vmovaps		%%zmm10	,%%zmm12			\n\t"\
		"vmovaps		%%zmm3	,%%zmm5			\n\t		vmovaps		%%zmm11	,%%zmm13			\n\t"\
		"vmulpd		(%%rdx)	,%%zmm2,%%zmm2		\n\t		vmulpd		(%%r14)	,%%zmm10,%%zmm10	\n\t"\
		"vmulpd		(%%rdi)	,%%zmm4,%%zmm4		\n\t		vmulpd		(%%r15)	,%%zmm12,%%zmm12	\n\t"\
	"vfnmadd231pd	(%%rdi)	,%%zmm3,%%zmm2		\n\t	vfnmadd231pd	(%%r15)	,%%zmm11,%%zmm10	\n\t"\
	" vfmadd231pd	(%%rdx)	,%%zmm5,%%zmm4		\n\t	 vfmadd231pd	(%%r14)	,%%zmm13,%%zmm12	\n\t"\
		"vmovaps	%%zmm2		,%%zmm3			\n\t		vmovaps	%%zmm10		,%%zmm11			\n\t"\
		"vmovaps	%%zmm4		,%%zmm5			\n\t		vmovaps	%%zmm12		,%%zmm13			\n\t"\
		"vaddpd	%%zmm0		,%%zmm2,%%zmm2		\n\t		vaddpd	%%zmm8 		,%%zmm10,%%zmm10	\n\t"\
		"vsubpd	%%zmm3		,%%zmm0,%%zmm0		\n\t		vsubpd	%%zmm11		,%%zmm8 ,%%zmm8 	\n\t"\
		"vaddpd	%%zmm1		,%%zmm4,%%zmm4		\n\t		vaddpd	%%zmm9 		,%%zmm12,%%zmm12	\n\t"\
		"vsubpd	%%zmm5		,%%zmm1,%%zmm1		\n\t		vsubpd	%%zmm13		,%%zmm9 ,%%zmm9 	\n\t"\
		"vmovaps	%%zmm2		,    (%%rax)	\n\t		vmovaps	%%zmm10		,    (%%r10)	\n\t"\
		"vmovaps	%%zmm4		,0x40(%%rax)	\n\t		vmovaps	%%zmm12		,0x40(%%r10)	\n\t"\
		"vmovaps	%%zmm0		,    (%%rbx)	\n\t		vmovaps	%%zmm8 		,    (%%r11)	\n\t"\
		"vmovaps	%%zmm1		,0x40(%%rbx)	\n\t		vmovaps	%%zmm9 		,0x40(%%r11)	\n\t"\
	/* combine to get 2 length-4 output subtransforms.
	In this step 2 of the 8-dft, we need address-pairs
		lcol:		rcol:
		i0,2,1,3	i4,6,5,7
		o0,2,1,3	o4,6,5,7
	At this point r[a|b]x have i2,3, r1[0|1] have i6,7, but cleaner to reload add0 and go from there.
	Since we will be loading o-addresses into regs starting with r[a|b]x and r1[0|1], use r[c|d]x and r1[2|3]
	for the I-address pairs here: */\
	"movq	%[in0],%%rcx			\n\t		leaq	(%%rcx,%%r8  ),%%r12	\n\t"/* [lcol,rcol] base-addresses = in0 + [0,1*istride] */\
	"leaq	(%%rcx,%%r8,2),%%rdx	\n\t		leaq	(%%r12,%%r8,2),%%r13	\n\t"/* in0 + [2,3*istride] */\
		"vmovaps	    (%%rdx)	,%%zmm0			\n\t		vmovaps	    (%%r13)	,%%zmm9 			\n\t"\
		"vmovaps	0x40(%%rdx)	,%%zmm1			\n\t		vmovaps	0x40(%%r13)	,%%zmm12			\n\t"\
		"vmovaps	    (%%rcx)	,%%zmm4			\n\t		vmovaps	    (%%r12)	,%%zmm8 			\n\t"\
		"vmovaps	0x40(%%rcx)	,%%zmm5			\n\t		vmovaps	0x40(%%r12)	,%%zmm13			\n\t"\
	"shlq	$2,%%r8			\n\t"/* From here on only need offset i4 = 4*i1 */\
	"addq	%%r8,%%rdx				\n\t		addq	%%r8,%%r13				\n\t"/* in0 + [6,7*istride] */\
	"addq	%%r8,%%rcx				\n\t		addq	%%r8,%%r12				\n\t"/* in0 + [4,5*istride] */\
		"vmovaps	    (%%rdx)	,%%zmm2			\n\t		vmovaps	    (%%r13)	,%%zmm11			\n\t"\
		"vmovaps	0x40(%%rdx)	,%%zmm3			\n\t		vmovaps	0x40(%%r13)	,%%zmm14			\n\t"\
		"vmovaps	    (%%rcx)	,%%zmm6			\n\t		vmovaps	    (%%r12)	,%%zmm10			\n\t"\
		"vmovaps	0x40(%%rcx)	,%%zmm7			\n\t		vmovaps	0x40(%%r12)	,%%zmm15			\n\t"\
	"movq	%[out0]	,%%rsi			\n\t	movq	%[off]	,%%rdi			\n\t"/* Load output base-address into rsi and offset-array pointer into rdi */\
	"movslq		    (%%rdi),%%rax	\n\t	movslq		0x10(%%rdi),%%r10	\n\t"/*        off[0,4] */\
	"leaq	(%%rsi,%%rax,8),%%rax	\n\t	leaq	(%%rsi,%%r10,8),%%r10	\n\t"/* out0 + off[0,4] */\
	"movslq		0x08(%%rdi),%%rbx	\n\t	movslq		0x18(%%rdi),%%r11	\n\t"\
	"leaq	(%%rsi,%%rbx,8),%%rbx	\n\t	leaq	(%%rsi,%%r11,8),%%r11	\n\t"/* out0 + off[2,6] */\
		"vsubpd		%%zmm0	,%%zmm4,%%zmm4		\n\t		vsubpd		%%zmm9 ,%%zmm13,%%zmm13		\n\t"\
		"vsubpd		%%zmm1	,%%zmm5,%%zmm5		\n\t		vsubpd		%%zmm12,%%zmm8 ,%%zmm8 		\n\t"\
		"vsubpd		%%zmm2	,%%zmm6,%%zmm6		\n\t		vsubpd		%%zmm11,%%zmm15,%%zmm15		\n\t"\
		"vsubpd		%%zmm3	,%%zmm7,%%zmm7		\n\t		vsubpd		%%zmm14,%%zmm10,%%zmm10		\n\t"\
	"movq	%[two]	,%%r15			\n\t"\
	"vmovaps	%%zmm14,(%%rbx) 	\n\t"/* spill zmm14 to make room for 2.0 */"	vmovaps	 (%%r15),%%zmm14	\n\t"/* two */\
	"vfmadd132pd	%%zmm14,%%zmm4,%%zmm0		\n\t	vfmadd132pd		%%zmm14,%%zmm13,%%zmm9 		\n\t"\
	"vfmadd132pd	%%zmm14,%%zmm5,%%zmm1		\n\t	vfmadd132pd		%%zmm14,%%zmm8 ,%%zmm12		\n\t"\
	"vfmadd132pd	%%zmm14,%%zmm6,%%zmm2		\n\t	vfmadd132pd		%%zmm14,%%zmm15,%%zmm11		\n\t"\
	"vfmadd132pd	%%zmm14,%%zmm7,%%zmm3		\n\t	vfmadd132pd		(%%rbx),%%zmm10,%%zmm14		\n\t"\
		"vsubpd	%%zmm2		,%%zmm0,%%zmm0		\n\t		vsubpd	%%zmm11		,%%zmm10,%%zmm10	\n\t"\
		"vsubpd	%%zmm3		,%%zmm1,%%zmm1		\n\t		vsubpd	%%zmm14		,%%zmm15,%%zmm15	\n\t"\
		"vsubpd	%%zmm7		,%%zmm4,%%zmm4		\n\t		vfmadd132pd	(%%r15) ,%%zmm10,%%zmm11	\n\t"\
		"vsubpd	%%zmm6		,%%zmm5,%%zmm5		\n\t		vfmadd132pd	(%%r15) ,%%zmm15,%%zmm14	\n\t"\
	"movslq		0x04(%%rdi),%%rcx	\n\t	movslq		0x14(%%rdi),%%r12	\n\t"\
	"leaq	(%%rsi,%%rcx,8),%%rcx	\n\t	leaq	(%%rsi,%%r12,8),%%r12	\n\t"/* out0 + off[1,5] */\
	"movslq		0x0c(%%rdi),%%rdx	\n\t	movslq		0x1c(%%rdi),%%r13	\n\t"\
	"leaq	(%%rsi,%%rdx,8),%%rdx	\n\t	leaq	(%%rsi,%%r13,8),%%r13	\n\t"/* out0 + off[3,7] */\
	/* Use the cosine term of the [c1,s1] pair, which is the *middle* [4th of 7] of our 7 input pairs, in terms \
	of the input-arg bit-reversal reordering defined in the __X[c,s] --> [c,s] mapping below and happens to \
	always in fact *be* a true cosine term, which is a requirement for our "decr 1 gives isrt2" data-copy scheme: */\
		"movq	%[twid_ptrs],%%r14		\n\t	movq	0x30(%%r14),%%r14	\n\t"\
		"										subq	$0x40,%%r14	\n\t"/* isrt2 in [c1]-1 */\
	"vfmadd132pd	(%%r15),%%zmm0,%%zmm2		\n\t	vfnmadd231pd	(%%r14),%%zmm10,%%zmm8 		\n\t"/* .isrt2 */\
	"vfmadd132pd	(%%r15),%%zmm1,%%zmm3		\n\t	vfnmadd231pd	(%%r14),%%zmm15,%%zmm13		\n\t"\
	"vfmadd132pd	(%%r15),%%zmm5,%%zmm6		\n\t	vfnmadd231pd	(%%r14),%%zmm11,%%zmm9 		\n\t"\
	"vfmadd132pd	(%%r15),%%zmm4,%%zmm7		\n\t	vfnmadd231pd	(%%r14),%%zmm14,%%zmm12		\n\t"\
		"vmovaps	%%zmm2,    (%%rax)	\n\t"/* [o0].re */"	vmovaps	%%zmm8 ,    (%%r12)	\n\t"/* [o5].re */	"vmovaps	(%%r14),%%zmm2		\n\t"/* zmm2 = ISRT2 */\
		"vmovaps	%%zmm3,0x40(%%rax)	\n\t"/* [o0].im */"	vmovaps	%%zmm13,0x40(%%r11)	\n\t"/* [o6].im */	"vaddpd	%%zmm2,%%zmm2,%%zmm2	\n\t"/* zmm2 = SQRT2; */\
		"vmovaps	%%zmm6,0x40(%%rbx)	\n\t"/* [o2].im */"	vmovaps	%%zmm9 ,0x40(%%r12)	\n\t"/* [o5].im */\
		"vmovaps	%%zmm7,    (%%rdx)	\n\t"/* [o3].re */"	vmovaps	%%zmm12,    (%%r11)	\n\t"/* [o6].re */\
		"												 vfmadd132pd	%%zmm2,%%zmm8 ,%%zmm10		\n\t"/* .sqrt2 */\
		"												 vfmadd132pd	%%zmm2,%%zmm13,%%zmm15		\n\t"\
		"												 vfmadd132pd	%%zmm2,%%zmm9 ,%%zmm11		\n\t"\
		"												 vfmadd132pd	%%zmm2,%%zmm12,%%zmm14		\n\t"\
		"vmovaps	%%zmm0,    (%%rcx)	\n\t"/* [o1].re */"	vmovaps	%%zmm10,    (%%r10)	\n\t"/* [o4].re */\
		"vmovaps	%%zmm1,0x40(%%rcx)	\n\t"/* [o1].im */"	vmovaps	%%zmm15,0x40(%%r13)	\n\t"/* [o7].im */\
		"vmovaps	%%zmm4,    (%%rbx)	\n\t"/* [o2].re */"	vmovaps	%%zmm11,0x40(%%r10)	\n\t"/* [o4].im */\
		"vmovaps	%%zmm5,0x40(%%rdx)	\n\t"/* [o3].im */"	vmovaps	%%zmm14,    (%%r13)	\n\t"/* [o7].re */\
		:					/* outputs: none */\
		: [in0] "m" (Xin0)	/* All 'm'-inputs from memory addresses here... */\
		 ,[i1] "e" (Xi1)	/* ...except for 'e'-inputs which are literal byte offsets */\
		 ,[out0] "m" (Xout0) /* output-address-octet base pointer */\
		 ,[off] "m" (Xoff)	/* and pointer to uint32 array of 8 double* index offsets */\
		 ,[twid_ptrs] "m" (Xtwid_ptrs)\
		 ,[two] "m" (Xtwo)/* Only used in FMA implementations of this macro */\
		: "cc","memory","rax","rbx","rcx","rdx","rsi","rdi","r8","r10","r11","r12","r13","r14","r15","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15"	/* Clobbered registers */\
	);\
	}

	/* Twiddleless version of SSE2_RADIX8_DIT_TWIDDLE. Inputs enter in memory locations __i0,__i1,__i2,__i3,__i4,__i5,__i6,__i7.
	Outputs go into 16 contiguous vector-width (64-byte in this AVX-512 build) memory locations starting at __out and assumed disjoint with inputs.
	This macro built on the same code template as SSE2_RADIX8_DIF_TWIDDLE0, but with the I/O-location indices mutually bit reversed:
	01234567 <--> 04261537, which can be effected via the pairwise swaps 1 <--> 4 and 3 <--> 6.
	*/
	/*
	SSE2_RADIX8_DIT_0TWIDDLE: twiddleless radix-8 DIT DFT, AVX-512 (zmm, 0x40-byte vector) build.
	In : 8 separately specified complex-vector input addresses Xi0-Xi7; Re part at +0x00, Im at +0x40 of each.
	Out: 16 contiguous vec_dbl slots starting at Xout (byte offsets 0x000-0x3c0), assumed disjoint from inputs.
	Xisrt2 -> vec_dbl 1/sqrt(2); Xtwo -> vec_dbl 2.0, with sqrt(2) read from Xtwo+0x80
	(cf. the 0x80(%%r9) load below) -- NOTE(review): confirm that constant layout against callers.
	Register-pressure notes: all 16 zmm are live, so the 2.0 and sqrt2 constants are repeatedly
	loaded into a zmm whose current value is first spilled to an as-yet-unwritten output slot
	at (%%rsi)/0x40(%%rsi) and consumed back from memory by the last FMA of each quartet.
	GPRs r8,r14,r15 are unused here, hence absent from the clobber list.
	*/
	#define	SSE2_RADIX8_DIT_0TWIDDLE(Xi0,Xi1,Xi2,Xi3,Xi4,Xi5,Xi6,Xi7, Xout, Xisrt2,Xtwo)\
	{\
	__asm__ volatile (\
		"movq	%[__isrt2],%%rdi				\n\t		movq	%[__two],%%r9	\n\t"/* r9 holds 2.0 throughout */\
		"movq	%[__out],%%rsi	\n\t"\
	/* 1st of 2 radix-4 subtransforms, data in zmm0-7: *//* 2nd of 2 radix-4 subtransforms, data in zmm8-15: */\
		"movq	%[__i0],%%rax					\n\t		movq	%[__i4],%%r10					\n\t"\
		"movq	%[__i1],%%rbx					\n\t		movq	%[__i5],%%r11					\n\t"\
		"movq	%[__i2],%%rcx					\n\t		movq	%[__i6],%%r12					\n\t"\
		"movq	%[__i3],%%rdx					\n\t		movq	%[__i7],%%r13					\n\t"\
		"vmovaps	    (%%rax),%%zmm2				\n\t		vmovaps	    (%%r10),%%zmm10	\n\t"\
		"vmovaps	0x40(%%rax),%%zmm3				\n\t		vmovaps	0x40(%%r10),%%zmm11	\n\t"\
		"vmovaps	    (%%rcx),%%zmm6				\n\t		vmovaps	    (%%r11),%%zmm8 	\n\t"\
		"vmovaps	0x40(%%rcx),%%zmm7				\n\t		vmovaps	0x40(%%r11),%%zmm9 	\n\t"\
		"vmovaps	    (%%rbx),%%zmm0				\n\t		vmovaps	    (%%r12),%%zmm14	\n\t"\
		"vmovaps	    (%%rdx),%%zmm4				\n\t		vmovaps	0x40(%%r12),%%zmm15	\n\t"\
		"vmovaps	0x40(%%rbx),%%zmm1				\n\t		vmovaps	    (%%r13),%%zmm12	\n\t"\
		"vmovaps	0x40(%%rdx),%%zmm5				\n\t"	/*	vmovaps	0x40(%%r13),%%zmm13	Instead use zmm13 for 2.0: */"	vmovaps	(%%r9),%%zmm13 	\n\t"\
		"vsubpd		%%zmm0,%%zmm2,%%zmm2			\n\t		vsubpd		%%zmm8 ,%%zmm10,%%zmm10		\n\t"\
		"vsubpd		%%zmm1,%%zmm3,%%zmm3			\n\t		vsubpd		%%zmm9 ,%%zmm11,%%zmm11		\n\t"\
		"vsubpd		%%zmm4,%%zmm6,%%zmm6			\n\t		vsubpd		%%zmm12,%%zmm14,%%zmm14		\n\t"\
		"vsubpd		%%zmm5,%%zmm7,%%zmm7			\n\t		vsubpd	0x40(%%r13),%%zmm15,%%zmm15		\n\t"/* rcol: i7.im comes straight from memory since zmm13 holds 2.0 */\
	"vfmadd132pd	%%zmm13,%%zmm2,%%zmm0			\n\t	vfmadd132pd		%%zmm13,%%zmm10,%%zmm8 		\n\t"\
	"vfmadd132pd	%%zmm13,%%zmm3,%%zmm1			\n\t	vfmadd132pd		%%zmm13,%%zmm11,%%zmm9 		\n\t"\
	"vfmadd132pd	%%zmm13,%%zmm6,%%zmm4			\n\t	vfmadd132pd		%%zmm13,%%zmm14,%%zmm12		\n\t"\
	"vfmadd132pd	%%zmm13,%%zmm7,%%zmm5			\n\t	vfmadd132pd	0x40(%%r13),%%zmm15,%%zmm13		\n\t"\
		"vsubpd		%%zmm7,%%zmm2,%%zmm2			\n\t		vsubpd		%%zmm12,%%zmm8 ,%%zmm8 		\n\t"\
		"vsubpd		%%zmm6,%%zmm3,%%zmm3			\n\t		vsubpd		%%zmm13,%%zmm9 ,%%zmm9 		\n\t"\
		"vsubpd		%%zmm4,%%zmm0,%%zmm0			\n\t		vsubpd		%%zmm14,%%zmm11,%%zmm11		\n\t"\
		"vsubpd		%%zmm5,%%zmm1,%%zmm1			\n\t		vsubpd		%%zmm15,%%zmm10,%%zmm10		\n\t"\
	"vmovaps	%%zmm14,(%%rsi) 	\n\t"/* spill zmm14 to make room for 2.0 */"	vmovaps	(%%r9),%%zmm14 	\n\t"/* two */\
	"vfmadd132pd	%%zmm14,%%zmm0,%%zmm4			\n\t	vfmadd132pd		%%zmm14,%%zmm8 ,%%zmm12		\n\t"\
	"vfmadd132pd	%%zmm14,%%zmm1,%%zmm5			\n\t	vfmadd132pd		%%zmm14,%%zmm9 ,%%zmm13		\n\t"\
	"vfmadd132pd	%%zmm14,%%zmm2,%%zmm7			\n\t	vfmadd132pd		%%zmm14,%%zmm10,%%zmm15		\n\t"\
	"vfmadd132pd	%%zmm14,%%zmm3,%%zmm6			\n\t	vfmadd132pd		(%%rsi),%%zmm11,%%zmm14		\n\t"/* rcol: last FMA re-reads spilled zmm14 from (%%rsi) */\
		"														vsubpd		%%zmm15,%%zmm11,%%zmm11		\n\t"\
		"														vsubpd		%%zmm10,%%zmm14,%%zmm14		\n\t"\
		"													vfmadd132pd		(%%r9 ),%%zmm11,%%zmm15		\n\t"/* .two */\
		"													vfmadd132pd		(%%r9 ),%%zmm14,%%zmm10		\n\t"\
		/* Outputs 1-7 order-reversed in the SIMD version of this macro: Thus swap output byte-offset pairs */\
		/* 0x[40,60] <-> [1c0,1e0], [80,a0] <-> [180,1a0], [c0,e0] <-> [140,160] : */\
		"vsubpd		%%zmm9 ,%%zmm0 ,%%zmm0 			\n\t	vfnmadd231pd	(%%rdi),%%zmm14,%%zmm2 		\n\t"/* .isrt2 */\
		"vsubpd		%%zmm13,%%zmm5 ,%%zmm5 			\n\t	vfnmadd231pd	(%%rdi),%%zmm11,%%zmm3 		\n\t"\
		"vsubpd		%%zmm12,%%zmm4 ,%%zmm4 			\n\t	vfnmadd231pd	(%%rdi),%%zmm15,%%zmm7 		\n\t"\
		"vsubpd		%%zmm8 ,%%zmm1 ,%%zmm1 			\n\t	vfnmadd231pd	(%%rdi),%%zmm10,%%zmm6 		\n\t"\
	"vmovaps	%%zmm8 ,0x40(%%rsi) 	\n\t"/* spill zmm8  to make room for 2.0 */"	vmovaps	 (%%r9),%%zmm8	\n\t"/* two */\
	"vmovaps	%%zmm10,(%%rsi) 	\n\t"/* spill zmm10 to make room for sqrt2 */"	vmovaps	0x80(%%r9),%%zmm10	\n\t"/* sqrt2 */\
		"vmovaps	%%zmm0 ,0x100(%%rsi)			\n\t		vmovaps		%%zmm2 ,0x080(%%rsi)		\n\t"\
		"vmovaps	%%zmm5 ,0x240(%%rsi)			\n\t		vmovaps		%%zmm3 ,0x1c0(%%rsi)		\n\t"\
		"vmovaps	%%zmm4 ,0x200(%%rsi)			\n\t		vmovaps		%%zmm7 ,0x180(%%rsi)		\n\t"\
		"vmovaps	%%zmm1 ,0x340(%%rsi)			\n\t		vmovaps		%%zmm6 ,0x2c0(%%rsi)		\n\t"\
	"vfmadd132pd		%%zmm8 ,%%zmm0 ,%%zmm9 		\n\t	vfmadd132pd		%%zmm10,%%zmm2 ,%%zmm14		\n\t"\
	"vfmadd132pd		%%zmm8 ,%%zmm5 ,%%zmm13		\n\t	vfmadd132pd		%%zmm10,%%zmm3 ,%%zmm11		\n\t"\
	"vfmadd132pd		%%zmm8 ,%%zmm4 ,%%zmm12		\n\t	vfmadd132pd		%%zmm10,%%zmm7 ,%%zmm15		\n\t"\
	"vfmadd132pd	0x40(%%rsi),%%zmm1 ,%%zmm8 		\n\t	vfmadd132pd		(%%rsi),%%zmm6 ,%%zmm10		\n\t"/* re-read both spills from their scratch slots */\
		"vmovaps	%%zmm9 ,0x300(%%rsi)			\n\t		vmovaps		%%zmm14,0x280(%%rsi)		\n\t"\
		"vmovaps	%%zmm13,0x040(%%rsi)			\n\t		vmovaps		%%zmm11,0x3c0(%%rsi)		\n\t"\
		"vmovaps	%%zmm12,     (%%rsi)			\n\t		vmovaps		%%zmm15,0x380(%%rsi)		\n\t"\
		"vmovaps	%%zmm8 ,0x140(%%rsi)			\n\t		vmovaps		%%zmm10,0x0c0(%%rsi)		\n\t"\
		:					/* outputs: none */\
		: [__i0] "m" (Xi0)	/* All iputs from memory addresses here */\
		 ,[__i1] "m" (Xi1)\
		 ,[__i2] "m" (Xi2)\
		 ,[__i3] "m" (Xi3)\
		 ,[__i4] "m" (Xi4)\
		 ,[__i5] "m" (Xi5)\
		 ,[__i6] "m" (Xi6)\
		 ,[__i7] "m" (Xi7)\
		 ,[__out] "m" (Xout)\
		 ,[__isrt2] "m" (Xisrt2)\
		 ,[__two] "m" (Xtwo)\
		: "cc","memory","rax","rbx","rcx","rdx","rsi","rdi","r9","r10","r11","r12","r13","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15"		/* Clobbered registers */\
	);\
	}

	// Same as SSE2_RADIX8_DIT_0TWIDDLE but with user-specifiable [i.e. not nec. contiguous] output addresses:
	/*
	SSE2_RADIX8_DIT_0TWIDDLE_OOP: same transform as SSE2_RADIX8_DIT_0TWIDDLE, but the 8 complex
	outputs go to user-specified (not necessarily contiguous) addresses Xo0-Xo7, Re part at
	+0x00 and Im at +0x40 of each, instead of one contiguous 16-slot block.
	Lacking a contiguous output buffer for spill space, this variant spills to the __i0 memory
	slot at (%%rax) -- so the __i0 input's Re word is clobbered with an intermediate -- and,
	once the O-addresses are loaded, to the __o0 slots at (%%rax)/0x40(%%rax) before their
	final values are stored. Constants: (%%r9) = 2.0, 0x80(%%r9) = sqrt2, (%%rdi) = 1/sqrt2.
	*/
	#define	SSE2_RADIX8_DIT_0TWIDDLE_OOP(Xi0,Xi1,Xi2,Xi3,Xi4,Xi5,Xi6,Xi7, Xo0,Xo1,Xo2,Xo3,Xo4,Xo5,Xo6,Xo7, Xisrt2,Xtwo)\
	{\
	__asm__ volatile (\
		"movq	%[__isrt2],%%rdi				\n\t		movq	%[__two],%%r9	\n\t"/* r9 holds 2.0 throughout */\
	/* 1st of 2 radix-4 subtransforms, data in zmm0-7: *//* 2nd of 2 radix-4 subtransforms, data in zmm8-15: */\
		"movq	%[__i0],%%rax					\n\t		movq	%[__i4],%%r10					\n\t"\
		"movq	%[__i1],%%rbx					\n\t		movq	%[__i5],%%r11					\n\t"\
		"movq	%[__i2],%%rcx					\n\t		movq	%[__i6],%%r12					\n\t"\
		"movq	%[__i3],%%rdx					\n\t		movq	%[__i7],%%r13					\n\t"\
		"vmovaps	    (%%rax),%%zmm2				\n\t		vmovaps	    (%%r10),%%zmm10	\n\t"\
		"vmovaps	0x40(%%rax),%%zmm3				\n\t		vmovaps	0x40(%%r10),%%zmm11	\n\t"\
		"vmovaps	    (%%rcx),%%zmm6				\n\t		vmovaps	    (%%r11),%%zmm8 	\n\t"\
		"vmovaps	0x40(%%rcx),%%zmm7				\n\t		vmovaps	0x40(%%r11),%%zmm9 	\n\t"\
		"vmovaps	    (%%rbx),%%zmm0				\n\t		vmovaps	    (%%r12),%%zmm14	\n\t"\
		"vmovaps	    (%%rdx),%%zmm4				\n\t		vmovaps	0x40(%%r12),%%zmm15	\n\t"\
		"vmovaps	0x40(%%rbx),%%zmm1				\n\t		vmovaps	    (%%r13),%%zmm12	\n\t"\
		"vmovaps	0x40(%%rdx),%%zmm5				\n\t"	/*	vmovaps	0x40(%%r13),%%zmm13	Instead use zmm13 for 2.0: */"	vmovaps	(%%r9),%%zmm13 	\n\t"\
		"vsubpd		%%zmm0,%%zmm2,%%zmm2			\n\t		vsubpd		%%zmm8 ,%%zmm10,%%zmm10		\n\t"\
		"vsubpd		%%zmm1,%%zmm3,%%zmm3			\n\t		vsubpd		%%zmm9 ,%%zmm11,%%zmm11		\n\t"\
		"vsubpd		%%zmm4,%%zmm6,%%zmm6			\n\t		vsubpd		%%zmm12,%%zmm14,%%zmm14		\n\t"\
		"vsubpd		%%zmm5,%%zmm7,%%zmm7			\n\t		vsubpd	0x40(%%r13),%%zmm15,%%zmm15		\n\t"/* rcol: i7.im comes straight from memory since zmm13 holds 2.0 */\
	"vfmadd132pd	%%zmm13,%%zmm2,%%zmm0			\n\t	vfmadd132pd		%%zmm13,%%zmm10,%%zmm8 		\n\t"\
	"vfmadd132pd	%%zmm13,%%zmm3,%%zmm1			\n\t	vfmadd132pd		%%zmm13,%%zmm11,%%zmm9 		\n\t"\
	"vfmadd132pd	%%zmm13,%%zmm6,%%zmm4			\n\t	vfmadd132pd		%%zmm13,%%zmm14,%%zmm12		\n\t"\
	"vfmadd132pd	%%zmm13,%%zmm7,%%zmm5			\n\t	vfmadd132pd	0x40(%%r13),%%zmm15,%%zmm13		\n\t"\
		"vsubpd		%%zmm7,%%zmm2,%%zmm2			\n\t		vsubpd		%%zmm12,%%zmm8 ,%%zmm8 		\n\t"\
		"vsubpd		%%zmm6,%%zmm3,%%zmm3			\n\t		vsubpd		%%zmm13,%%zmm9 ,%%zmm9 		\n\t"\
		"vsubpd		%%zmm4,%%zmm0,%%zmm0			\n\t		vsubpd		%%zmm14,%%zmm11,%%zmm11		\n\t"\
		"vsubpd		%%zmm5,%%zmm1,%%zmm1			\n\t		vsubpd		%%zmm15,%%zmm10,%%zmm10		\n\t"\
	"vmovaps	%%zmm14,(%%rax) 	\n\t"/* spill zmm14 to make room for 2.0; NB: overwrites __i0.re */"	vmovaps	(%%r9),%%zmm14 	\n\t"/* two */\
	"vfmadd132pd	%%zmm14,%%zmm0,%%zmm4			\n\t	vfmadd132pd		%%zmm14,%%zmm8 ,%%zmm12		\n\t"\
	"vfmadd132pd	%%zmm14,%%zmm1,%%zmm5			\n\t	vfmadd132pd		%%zmm14,%%zmm9 ,%%zmm13		\n\t"\
	"vfmadd132pd	%%zmm14,%%zmm2,%%zmm7			\n\t	vfmadd132pd		%%zmm14,%%zmm10,%%zmm15		\n\t"\
	"vfmadd132pd	%%zmm14,%%zmm3,%%zmm6			\n\t	vfmadd132pd		(%%rax),%%zmm11,%%zmm14		\n\t"/* rcol: last FMA re-reads spilled zmm14 */\
		"														vsubpd		%%zmm15,%%zmm11,%%zmm11		\n\t"\
		"														vsubpd		%%zmm10,%%zmm14,%%zmm14		\n\t"\
		"													vfmadd132pd		(%%r9 ),%%zmm11,%%zmm15		\n\t"/* .two */\
		"													vfmadd132pd		(%%r9 ),%%zmm14,%%zmm10		\n\t"\
		/* Outputs 1-7 order-reversed in the SIMD version of this macro: Thus swap output byte-offset pairs */\
		/* 0x[40,60] <-> [1c0,1e0], [80,a0] <-> [180,1a0], [c0,e0] <-> [140,160] : */\
		"movq	%[__o0],%%rax						\n\t		movq	%[__o4],%%r10					\n\t"\
		"movq	%[__o1],%%rbx						\n\t		movq	%[__o5],%%r11					\n\t"\
		"vsubpd		%%zmm9 ,%%zmm0 ,%%zmm0 			\n\t	vfnmadd231pd	(%%rdi),%%zmm14,%%zmm2 		\n\t"/* .isrt2 */\
		"vsubpd		%%zmm13,%%zmm5 ,%%zmm5 			\n\t	vfnmadd231pd	(%%rdi),%%zmm11,%%zmm3 		\n\t"\
		"vsubpd		%%zmm12,%%zmm4 ,%%zmm4 			\n\t	vfnmadd231pd	(%%rdi),%%zmm15,%%zmm7 		\n\t"\
		"vsubpd		%%zmm8 ,%%zmm1 ,%%zmm1 			\n\t	vfnmadd231pd	(%%rdi),%%zmm10,%%zmm6 		\n\t"\
		"movq	%[__o2],%%rcx						\n\t		movq	%[__o6],%%r12					\n\t"\
		"movq	%[__o3],%%rdx						\n\t		movq	%[__o7],%%r13					\n\t"\
	"vmovaps	%%zmm8 ,0x40(%%rax) 	\n\t"/* spill zmm8  to make room for 2.0; rax now -> __o0 */"	vmovaps	 (%%r9),%%zmm8	\n\t"/* two */\
	"vmovaps	%%zmm10,(%%rax) 	\n\t"/* spill zmm10 to make room for sqrt2 */"	vmovaps	0x80(%%r9),%%zmm10	\n\t"/* sqrt2 */\
		"vmovaps	%%zmm0 ,    (%%rcx)				\n\t		vmovaps		%%zmm2 ,    (%%rbx)		\n\t"\
		"vmovaps	%%zmm5 ,0x40(%%r10)				\n\t		vmovaps		%%zmm3 ,0x40(%%rdx)		\n\t"\
		"vmovaps	%%zmm4 ,    (%%r10)				\n\t		vmovaps		%%zmm7 ,    (%%rdx)		\n\t"\
		"vmovaps	%%zmm1 ,0x40(%%r12)				\n\t		vmovaps		%%zmm6 ,0x40(%%r11)		\n\t"\
	"vfmadd132pd		%%zmm8 ,%%zmm0 ,%%zmm9 		\n\t	vfmadd132pd		%%zmm10,%%zmm2 ,%%zmm14		\n\t"\
	"vfmadd132pd		%%zmm8 ,%%zmm5 ,%%zmm13		\n\t	vfmadd132pd		%%zmm10,%%zmm3 ,%%zmm11		\n\t"\
	"vfmadd132pd		%%zmm8 ,%%zmm4 ,%%zmm12		\n\t	vfmadd132pd		%%zmm10,%%zmm7 ,%%zmm15		\n\t"\
	"vfmadd132pd	0x40(%%rax),%%zmm1 ,%%zmm8 		\n\t	vfmadd132pd		(%%rax),%%zmm6 ,%%zmm10		\n\t"/* re-read both spills from the __o0 scratch slots */\
		"vmovaps	%%zmm9 ,    (%%r12)				\n\t		vmovaps		%%zmm14,    (%%r11)		\n\t"\
		"vmovaps	%%zmm13,0x40(%%rax)				\n\t		vmovaps		%%zmm11,0x40(%%r13)		\n\t"\
		"vmovaps	%%zmm12,    (%%rax)				\n\t		vmovaps		%%zmm15,    (%%r13)		\n\t"\
		"vmovaps	%%zmm8 ,0x40(%%rcx)				\n\t		vmovaps		%%zmm10,0x40(%%rbx)		\n\t"\
		:					/* outputs: none */\
		: [__i0] "m" (Xi0)	/* All iputs from memory addresses here */\
		 ,[__i1] "m" (Xi1)\
		 ,[__i2] "m" (Xi2)\
		 ,[__i3] "m" (Xi3)\
		 ,[__i4] "m" (Xi4)\
		 ,[__i5] "m" (Xi5)\
		 ,[__i6] "m" (Xi6)\
		 ,[__i7] "m" (Xi7)\
		 ,[__o0] "m" (Xo0)\
		 ,[__o1] "m" (Xo1)\
		 ,[__o2] "m" (Xo2)\
		 ,[__o3] "m" (Xo3)\
		 ,[__o4] "m" (Xo4)\
		 ,[__o5] "m" (Xo5)\
		 ,[__o6] "m" (Xo6)\
		 ,[__o7] "m" (Xo7)\
		 ,[__isrt2] "m" (Xisrt2)\
		 ,[__two] "m" (Xtwo)\
		: "cc","memory","rax","rbx","rcx","rdx","rdi","r9","r10","r11","r12","r13","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15"		/* Clobbered registers */\
	);\
	}

	// Opcount (as tallied for the AVX2/ymm version of this macro; counts carry over per-vector here): 84 vec MEM [30 implicit], 31 ADD/SUB, 50 MUL, 36 FMA, i.e. trade 36 ADD+MUL for 36 FMA (plus one more ADD to generate SQRT2 from ISRT2).
	//*** Dec 2020: For guide to updated reduced-#arg IO address computation, cf. the SSE2 version of this macro. ***
	/*
	SSE2_RADIX8_DIT_TWIDDLE_OOP: out-of-place radix-8 DIT pass with 7 complex twiddles, AVX-512 build.
	Reduced-arg form: inputs at in0 + j*i1, j = 0-7, with i1 a compile-time literal byte stride
	('e'-constraint); outputs at out0 + j*o_off with o_off a run-time byte stride. twid_ptrs -> array
	of 14 vec_dbl* giving the (c,s) pointer pairs for twiddles 1-7 in order. ISRT2 is read from
	[c4]-0x40 (the -0x40(%%rdi) load below), per this file's [c]-1-slot constant convention --
	NOTE(review): confirm against the twiddle-table builder.
	The first 4 input complex slots are overwritten with Block-0/1/2/3 intermediates and reloaded
	later, and (%%rdx) [= in0+3*i1] doubles as spill scratch for the 2.0 constant loads.
	*/
	#define SSE2_RADIX8_DIT_TWIDDLE_OOP(Xin0,Xi1, Xout0,Xo_off, Xtwid_ptrs, Xtwo)\
	{\
	__asm__ volatile (\
	/* i1 is base byte-offset, no need to lshift it prior to add: */\
		"xorq	%%r8,%%r8	\n\t	leaq	%c[i1](%%r8),%%r8	\n\t"/* movq|movslq of literal %c[i1] both segfaulted, workaround via LEA */\
		/* The twid_ptrs[] array holds ptrs to 14 complex twiddles in-order: (c,s)[1,2,3,4,5,6,7]: */\
		"movq	%[twid_ptrs],%%r14	\n\t"\
	/* Block 0/1 has just one twiddle-CMUL: */											/* Blocks 2/3 use separate register subset, can be done overlapped with 0/1: */\
	"movq		%[in0],%%rax		\n\t"\
		"leaq	(%%rax,%%r8  ),%%rbx	\n\t"\
		"leaq	(%%rax,%%r8,2),%%rcx	\n\t											movq	0x10(%%r14),%%r10				\n\t	movq	0x20(%%r14),%%r12			\n\t"/* c2,c3 */\
		"leaq	(%%rcx,%%r8  ),%%rdx	\n\t											movq	0x18(%%r14),%%r11				\n\t	movq	0x28(%%r14),%%r13			\n\t"/* s2,s3 */\
	"movq	    (%%r14),%%rdi	\n\t/* [rdi,rsi] -> [c,s] components of each sincos pair, */	vmovaps		(%%rcx),%%zmm8 		\n\t	vmovaps		0x40(%%rcx),%%zmm9 		\n\t"/* lcol: c1,s1 */\
	"movq	0x08(%%r14),%%rsi	\n\t/* (not truly a pair here in terms of rel-addresses). */	vmovaps	%%zmm9 ,%%zmm10			\n\t	vmovaps		%%zmm8 ,%%zmm11			\n\t"\
	"vmovaps	    (%%rbx),%%zmm4 		\n\t	vmovaps		0x40(%%rbx),%%zmm5 		\n\t	vmovaps		(%%rdx),%%zmm12			\n\t	vmovaps		0x40(%%rdx),%%zmm13		\n\t"\
	"vmovaps	    (%%rax),%%zmm0 		\n\t	vmovaps		0x40(%%rax),%%zmm1 		\n\t	vmovaps	%%zmm13,%%zmm14				\n\t	vmovaps		%%zmm12,%%zmm15			\n\t"\
	"vmovaps	%%zmm5 ,%%zmm6 			\n\t	vmovaps		%%zmm4 ,%%zmm7 			\n\t	vmulpd		(%%r10),%%zmm8 ,%%zmm8 	\n\t	vmulpd		(%%r10),%%zmm9 ,%%zmm9 	\n\t"\
	"vmulpd		 (%%rdi),%%zmm4 ,%%zmm4 \n\t	vmulpd		(%%rdi),%%zmm5 ,%%zmm5 	\n\t	vmulpd		(%%r12),%%zmm12,%%zmm12	\n\t	vmulpd		(%%r12),%%zmm13,%%zmm13	\n\t"\
	"vfmadd231pd (%%rsi),%%zmm6 ,%%zmm4 \n\t   vfnmadd231pd (%%rsi),%%zmm7 ,%%zmm5 	\n\t	vfmadd231pd (%%r11),%%zmm10,%%zmm8 	\n\t   vfnmadd231pd (%%r11),%%zmm11,%%zmm9 	\n\t"\
	"vmovaps	%%zmm0 ,%%zmm2 			\n\t	vmovaps		%%zmm1 ,%%zmm3 			\n\t	vfmadd231pd (%%r13),%%zmm14,%%zmm12	\n\t   vfnmadd231pd (%%r13),%%zmm15,%%zmm13	\n\t"\
	"vaddpd		%%zmm4 ,%%zmm0 ,%%zmm0 	\n\t	vaddpd		%%zmm5 ,%%zmm1 ,%%zmm1 	\n\t	vmovaps		%%zmm8 ,%%zmm10			\n\t	vmovaps		%%zmm9 ,%%zmm11			\n\t"\
	"vsubpd		%%zmm4 ,%%zmm2 ,%%zmm2 	\n\t	vsubpd		%%zmm5 ,%%zmm3 ,%%zmm3 	\n\t	vaddpd		%%zmm12,%%zmm8 ,%%zmm8 	\n\t	vaddpd		%%zmm13,%%zmm9 ,%%zmm9 	\n\t"\
	"vmovaps	%%zmm0 ,    (%%rax)		\n\t	vmovaps		%%zmm1 ,0x40(%%rax)		\n\t	vsubpd		%%zmm12,%%zmm10,%%zmm10	\n\t	vsubpd		%%zmm13,%%zmm11,%%zmm11	\n\t"\
	"vmovaps	%%zmm2 ,    (%%rbx)		\n\t	vmovaps		%%zmm3 ,0x40(%%rbx)		\n\t	vmovaps		%%zmm8 ,    (%%rcx)		\n\t	vmovaps		%%zmm9 ,0x40(%%rcx)		\n\t"\
														/* Now do radix-2 butterfly: */	"	vmovaps		%%zmm10,    (%%rdx)		\n\t	vmovaps		%%zmm11,0x40(%%rdx)		\n\t"\
	/* Blocks 4/5: */																		/* Blocks 6/7 use separate register subset, can be done overlapped with 4/5: */\
	"shlq	$2,%%r8			\n\t"/* From here on only need offset i4 = 4*i1 */\
	"addq	%%r8,%%rax		\n\t"/* Remaining 4 I-address-calculations are in-place += i4, so use ADD, faster than LEA */\
	"addq	%%r8,%%rbx		\n\t"\
	"addq	%%r8,%%rcx		\n\t"\
	"addq	%%r8,%%rdx		\n\t"\
	"vmovaps		(%%rax),%%zmm0 		\n\t	vmovaps		0x40(%%rax),%%zmm1 		\n\t	vmovaps		(%%rcx),%%zmm8 			\n\t	vmovaps		0x40(%%rcx),%%zmm9 		\n\t"\
	"vmovaps	%%zmm1 ,%%zmm2 			\n\t	vmovaps		%%zmm0 ,%%zmm3 			\n\t	vmovaps		%%zmm9 ,%%zmm10			\n\t	vmovaps		%%zmm8 ,%%zmm11			\n\t"\
	"vmovaps		(%%rbx),%%zmm4 		\n\t	vmovaps		0x40(%%rbx),%%zmm5 		\n\t	vmovaps		(%%rdx),%%zmm12			\n\t	vmovaps		0x40(%%rdx),%%zmm13		\n\t"\
	"vmovaps	%%zmm5 ,%%zmm6 			\n\t	vmovaps		%%zmm4 ,%%zmm7 			\n\t	vmovaps		%%zmm13,%%zmm14			\n\t	vmovaps		%%zmm12,%%zmm15			\n\t"\
	"subq		%%r8,%%rax			\n\t"/* Restore rax-rdx to the low-half input addresses */\
	"subq		%%r8,%%rbx			\n\t"\
	"subq		%%r8,%%rcx			\n\t"\
	"subq		%%r8,%%rdx			\n\t"\
	"movq	0x30(%%r14),%%rdi			\n\t												movq	0x50(%%r14),%%r10				\n\t"/* c4,c6 */\
	"movq	0x40(%%r14),%%r8 			\n\t												movq	0x60(%%r14),%%r12				\n\t"/* c5,c7 */\
	"movq	0x38(%%r14),%%rsi			\n\t												movq	0x58(%%r14),%%r11				\n\t"/* s4,s6 */\
	"movq	0x48(%%r14),%%r9 			\n\t												movq	0x68(%%r14),%%r13				\n\t"/* s5,s7 */\
	"vmulpd		 (%%rdi),%%zmm0 ,%%zmm0 \n\t	vmulpd		 (%%rdi),%%zmm1 ,%%zmm1 \n\t	vmulpd		(%%r10),%%zmm8 ,%%zmm8 	\n\t	vmulpd		 (%%r10),%%zmm9 ,%%zmm9 \n\t"\
	"vmulpd		 (%%r8 ),%%zmm4 ,%%zmm4 \n\t	vmulpd		 (%%r8 ),%%zmm5 ,%%zmm5 \n\t	vmulpd		(%%r12),%%zmm12,%%zmm12	\n\t	vmulpd		 (%%r12),%%zmm13,%%zmm13\n\t"\
	"vfmadd231pd (%%rsi),%%zmm2 ,%%zmm0 \n\t	vfnmadd231pd (%%rsi),%%zmm3 ,%%zmm1 \n\t	vfmadd231pd (%%r11),%%zmm10,%%zmm8 	\n\t	vfnmadd231pd (%%r11),%%zmm11,%%zmm9 \n\t"\
	"vfmadd231pd (%%r9 ),%%zmm6 ,%%zmm4 \n\t	vfnmadd231pd (%%r9 ),%%zmm7 ,%%zmm5 \n\t	vfmadd231pd (%%r13),%%zmm14,%%zmm12	\n\t	vfnmadd231pd (%%r13),%%zmm15,%%zmm13\n\t"\
	/* Now do radix-2 butterfly: */\
	"vmovaps	%%zmm0 ,%%zmm2 			\n\t	vmovaps		%%zmm1 ,%%zmm3 			\n\t	vmovaps		%%zmm8 ,%%zmm10			\n\t	vmovaps		%%zmm9 ,%%zmm11			\n\t"\
	"vaddpd		%%zmm4 ,%%zmm0 ,%%zmm0 	\n\t	vaddpd		%%zmm5 ,%%zmm1 ,%%zmm1 	\n\t	vaddpd		%%zmm12,%%zmm8 ,%%zmm8 	\n\t	vaddpd		%%zmm13,%%zmm9 ,%%zmm9 	\n\t"\
	"vsubpd		%%zmm4 ,%%zmm2 ,%%zmm2 	\n\t	vsubpd		%%zmm5 ,%%zmm3 ,%%zmm3 	\n\t	vsubpd		%%zmm12,%%zmm10,%%zmm10	\n\t	vsubpd		%%zmm13,%%zmm11,%%zmm11	\n\t"\
	/* Reload Block 0-3 outputs into r4-7,c-f, combine to get the 2 length-4 subtransform... */\
	"vmovaps		(%%rax),%%zmm4 		\n\t	vmovaps		0x40(%%rax),%%zmm5 		\n\t"\
	"vmovaps		(%%rbx),%%zmm6 		\n\t	vmovaps		0x40(%%rbx),%%zmm7 		\n\t"\
	"vmovaps		(%%rcx),%%zmm12		\n\t	vmovaps		0x40(%%rcx),%%zmm13		\n\t"\
	"vmovaps		(%%rdx),%%zmm14		\n\t	vmovaps		0x40(%%rdx),%%zmm15		\n\t"\
	"movq		%[out0],%%rax			\n\t	movq		%[o_off],%%r8		\n\t"/* out0, off1 */\
	"movq		%[two],%%rsi			\n\t	leaq		(%%r8,%%r8),%%r9	\n\t"/* (vec_dbl)2.0, off2 */\
		"										leaq		(%%r9,%%r9),%%r10	\n\t"/* off4 */\
	"vsubpd		%%zmm12,%%zmm4 ,%%zmm4 	\n\t	vsubpd		%%zmm13,%%zmm5 ,%%zmm5 	\n\t"\
	"vsubpd		%%zmm15,%%zmm6 ,%%zmm6 	\n\t	vsubpd		%%zmm14,%%zmm7 ,%%zmm7 	\n\t"\
	"vsubpd		%%zmm8 ,%%zmm0 ,%%zmm0 	\n\t	vsubpd		%%zmm9 ,%%zmm1 ,%%zmm1 	\n\t"\
	"vsubpd		%%zmm11,%%zmm2 ,%%zmm2 	\n\t	vsubpd		%%zmm10,%%zmm3 ,%%zmm3 	\n\t"\
	/* We hope the microcode execution engine inlines the MULs with the above SUBs: */\
	"vmovaps	%%zmm10,(%%rdx) 	\n\t"/* spill zmm10 to make room for 2.0 */"	vmovaps	(%%rsi),%%zmm10 \n\t"/* two */\
	"vfmadd132pd %%zmm10,%%zmm4,%%zmm12	\n\t	vfmadd132pd %%zmm10,%%zmm5 ,%%zmm13	\n\t"\
	"vfmadd132pd %%zmm10,%%zmm6,%%zmm15	\n\t	vfmadd132pd %%zmm10,%%zmm7 ,%%zmm14	\n\t"\
	"vfmadd132pd %%zmm10,%%zmm0,%%zmm8 	\n\t	vfmadd132pd %%zmm10,%%zmm1 ,%%zmm9 	\n\t"\
	"vfmadd132pd %%zmm10,%%zmm2,%%zmm11	\n\t	vfmadd132pd (%%rdx),%%zmm3 ,%%zmm10	\n\t"/* last FMA re-reads spilled zmm10 */\
	/* In terms of our original scalar-code prototyping macro, the data are: __tr0 = _r[c,f,4,6,8,b,0,2], __ti0 = _r[d,7,5,e,9,3,1,a]; */\
	/* Now combine the two half-transforms: */\
	/* Need r2/3+- a/b combos for the *ISRT2 preceding the output 4-7 radix-2 butterflies, so start them first: */\
	"vsubpd		%%zmm3 ,%%zmm11,%%zmm11	\n\t	vsubpd		%%zmm10,%%zmm2 ,%%zmm2 	\n\t"\
	"vsubpd		%%zmm8 ,%%zmm12,%%zmm12	\n\t	vsubpd		%%zmm9 ,%%zmm13,%%zmm13	\n\t"\
	"vsubpd		%%zmm1 ,%%zmm4 ,%%zmm4 	\n\t	vsubpd		%%zmm0 ,%%zmm5 ,%%zmm5 	\n\t"\
	"vmovaps	%%zmm0 ,(%%rdx) 	\n\t"/* spill zmm0 to make room for 2.0 */"	vmovaps	(%%rsi),%%zmm0  \n\t"/* two */\
	"vfmadd132pd %%zmm0,%%zmm11,%%zmm3 	\n\t	vfmadd132pd %%zmm0 ,%%zmm2 ,%%zmm10	\n\t"\
	"vfmadd132pd %%zmm0,%%zmm12,%%zmm8 	\n\t	vfmadd132pd %%zmm0 ,%%zmm13,%%zmm9 	\n\t"\
	"vfmadd132pd %%zmm0,%%zmm4 ,%%zmm1 	\n\t	vfmadd132pd (%%rdx),%%zmm5 ,%%zmm0 	\n\t"\
	/*movq		%[out0],%%rax		[out0] already in rax */	\
	"leaq	(%%rax,%%r9 ),%%rcx		\n\t"/* out0 + off2, compute first to allow time for LEA to finish before += off4 to get out0 + off6 */\
	"leaq	(%%rax,%%r10),%%rbx		\n\t"/* out0 + off4 */\
	"leaq	(%%rcx,%%r10),%%rdx		\n\t"/* out0 + off6 */\
	"vmovaps	%%zmm12,    (%%rbx)		\n\t	vmovaps		%%zmm13,0x40(%%rbx)		\n\t"/* __Br1 = _rc;	__Bi1 = _rd; */\
	/* Use that _rc,d free to stick ISRT2 into _rc and SQRT2 into _rd: */\
	"vmovaps	-0x40(%%rdi),%%zmm12	\n\t	vaddpd	%%zmm12,%%zmm12,%%zmm13		\n\t"/* zmm12 = ISRT2;	zmm13 = SQRT2; */\
	"vmovaps	%%zmm4 ,    (%%rdx)		\n\t	vmovaps		%%zmm0 ,0x40(%%rdx)		\n\t"/* __Br3 = _r4;	__Bi3 = _r0; */\
	"vmovaps	%%zmm8 ,    (%%rax)		\n\t	vmovaps		%%zmm9 ,0x40(%%rax)		\n\t"/* __Br0 = _r8;	__Bi0 = _r9; */\
	"vmovaps	%%zmm1 ,    (%%rcx)		\n\t	vmovaps		%%zmm5 ,0x40(%%rcx)		\n\t"/* __Br2 = _r1;	__Bi2 = _r5; */\
	"vfnmadd231pd %%zmm12,%%zmm3,%%zmm15\n\t	vfnmadd231pd %%zmm12,%%zmm11,%%zmm7 \n\t"\
	"vfnmadd231pd %%zmm12,%%zmm2,%%zmm6	\n\t	vfnmadd231pd %%zmm12,%%zmm10,%%zmm14\n\t"\
	" vfmadd132pd %%zmm13,%%zmm15,%%zmm3\n\t	 vfmadd132pd %%zmm13,%%zmm7 ,%%zmm11\n\t"\
	" vfmadd132pd %%zmm13,%%zmm6 ,%%zmm2\n\t	 vfmadd132pd %%zmm13,%%zmm14,%%zmm10\n\t"\
	"addq		%%r8 ,%%rax			\n\t"/* out0 + off[1,5,3,7] */\
	"addq		%%r8 ,%%rbx			\n\t"\
	"addq		%%r8 ,%%rcx			\n\t"\
	"addq		%%r8 ,%%rdx			\n\t"\
	"vmovaps	%%zmm3 ,    (%%rax)		\n\t	vmovaps		%%zmm7 ,0x40(%%rax)		\n\t"/* __Br4 = _r3;	__Bi4 = _r7; */\
	"vmovaps	%%zmm15,    (%%rbx)		\n\t	vmovaps		%%zmm11,0x40(%%rbx)		\n\t"/* __Br5 = _rf;	__Bi5 = _rb; */\
	"vmovaps	%%zmm6 ,    (%%rcx)		\n\t	vmovaps		%%zmm14,0x40(%%rcx)		\n\t"/* __Br6 = _r6;	__Bi6 = _re; */\
	"vmovaps	%%zmm2 ,    (%%rdx)		\n\t	vmovaps		%%zmm10,0x40(%%rdx)		\n\t"/* __Br7 = _r2;	__Bi7 = _ra; */\
		:					/* outputs: none */\
		: [in0] "m" (Xin0)	/* All 'm'-inputs from memory addresses here... */\
		 ,[i1] "e" (Xi1)	/* ...except for 'e'-inputs which are literal byte offsets */\
		 ,[out0] "m" (Xout0)\
		 ,[o_off] "m" (Xo_off)/* O-address pointer-stride */\
		 ,[twid_ptrs] "m" (Xtwid_ptrs)\
		 ,[two] "m" (Xtwo)/* Only used in FMA implementations of this macro */\
		: "cc","memory","rax","rbx","rcx","rdx","rsi","rdi","r8","r9","r10","r11","r12","r13","r14","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15"	/* Clobbered registers */\
	);\
	}

/*** Prefetch odd-index iaddresses in DIF below, even-index oaddresses in SSE2_RADIX16_DIF_TWIDDLE_OOP ***/

	// Based on the SSE2_RADIX16_DIF_NOTWIDDLE macro in radix16_ditN_cy_dif1_gcc64.h, but with completely
	// specifiable 16-output addressing required for usage as the power-of-2 component of a twiddleless
	// radix = [odd*2^n] DFT routine.
	/* Dec 2020: Needed to cut #args for Apple M1/Clang builds on Arm64 - do similar on x86 to avoid
	multiple versions of the macro having different arglists. Replace 16 O-addresses with O-base-address
	out0 and pointer to array of 16 int offset-indices: */
	#define SSE2_RADIX16_DIF_0TWIDDLE(Xin0,Xi1,Xi2,Xi3,Xi4, Xisrt2,Xtwo, Xout0,Xoff)\
	{\
	__asm__ volatile (\
		"movq	%[__two],%%r15	\n\t"/* two, used for FMA-based double-and-ADD/SUBs */\
	/* Block 0: SSE2_RADIX4_DIF_IN_PLACE(r1 , r17, r9 , r25): */	/* Block 2: SSE2_RADIX4_DIF_IN_PLACE(r5 , r21, r13, r29): */\
	"movq	%[__in0],%%rax	\n\t"/* Note BR of r[abcd]x: b<-->c */	"	leaq	%c[__i2](%%rax),%%r10	\n\t"/* addr += 2*ostride */\
	"leaq	%c[__i4](%%rax),%%rcx	\n\t"/* __in0+  [4*istride] */	"	leaq	%c[__i2](%%rcx),%%r12	\n\t"/* w.r.to to Block 0 */\
	"leaq	%c[__i4](%%rcx),%%rbx	\n\t"/* __in0+2*[4*istride] */	"	leaq	%c[__i2](%%rbx),%%r11	\n\t"/* Note BR of r1[0123]: r11<-->r12 */\
	"leaq	%c[__i4](%%rbx),%%rdx	\n\t"/* __in0+3*[4*istride] */	"	leaq	%c[__i2](%%rdx),%%r13	\n\t"\
		"vmovaps	    (%%rbx),%%zmm0							\n\t	vmovaps	    (%%r11),%%zmm8 	\n\t"\
		"vmovaps	0x40(%%rbx),%%zmm1							\n\t	vmovaps	0x40(%%r11),%%zmm9 	\n\t"\
		"vmovaps	    (%%rax),%%zmm2							\n\t	vmovaps	    (%%r10),%%zmm10	\n\t"\
		"vmovaps	0x40(%%rax),%%zmm3							\n\t	vmovaps	0x40(%%r10),%%zmm11	\n\t"\
		"vmovaps	    (%%rdx),%%zmm4							\n\t	vmovaps	    (%%r13),%%zmm12	\n\t"\
		"vmovaps	0x40(%%rdx),%%zmm5							\n\t	vmovaps	0x40(%%r13),%%zmm13	\n\t"\
		"vmovaps	    (%%rcx),%%zmm6							\n\t	vmovaps	    (%%r12),%%zmm14	\n\t"\
		"vmovaps	0x40(%%rcx),%%zmm7							\n\t	vmovaps	0x40(%%r12),%%zmm15	\n\t"\
		"vsubpd		%%zmm0 ,%%zmm2,%%zmm2						\n\t	vsubpd		%%zmm8 ,%%zmm10,%%zmm10	\n\t"\
		"vsubpd		%%zmm1 ,%%zmm3,%%zmm3						\n\t	vsubpd		%%zmm9 ,%%zmm11,%%zmm11	\n\t"\
		"vsubpd		%%zmm4 ,%%zmm6,%%zmm6						\n\t	vsubpd		%%zmm12,%%zmm14,%%zmm14	\n\t"\
		"vsubpd		%%zmm5 ,%%zmm7,%%zmm7						\n\t	vsubpd		%%zmm13,%%zmm15,%%zmm15	\n\t"\
	"vmovaps	%%zmm13,(%%rax) 	\n\t"/* spill zmm13 to make room for 2.0 */"	vmovaps	(%%r15),%%zmm13	\n\t"/* two */\
	"vfmadd132pd	%%zmm13,%%zmm2,%%zmm0						\n\t	vfmadd132pd	%%zmm13,%%zmm10,%%zmm8 	\n\t"\
	"vfmadd132pd	%%zmm13,%%zmm3,%%zmm1						\n\t	vfmadd132pd	%%zmm13,%%zmm11,%%zmm9 	\n\t"\
	"vfmadd132pd	%%zmm13,%%zmm6,%%zmm4						\n\t	vfmadd132pd	%%zmm13,%%zmm14,%%zmm12	\n\t"\
	"vfmadd132pd	%%zmm13,%%zmm7,%%zmm5						\n\t	vfmadd132pd	(%%rax),%%zmm15,%%zmm13	\n\t"\
		"vsubpd	%%zmm4,%%zmm0,%%zmm0							\n\t	vsubpd	%%zmm12,%%zmm8 ,%%zmm8 		\n\t"\
		"vsubpd	%%zmm5,%%zmm1,%%zmm1							\n\t	vsubpd	%%zmm13,%%zmm9 ,%%zmm9 		\n\t"\
		"vsubpd	%%zmm7,%%zmm2,%%zmm2							\n\t	vsubpd	%%zmm15,%%zmm10,%%zmm10		\n\t"\
		"vsubpd	%%zmm6,%%zmm3,%%zmm3							\n\t	vsubpd	%%zmm14,%%zmm11,%%zmm11		\n\t"\
		"vmovaps	%%zmm0,    (%%rbx)							\n\t	vmovaps	%%zmm8 ,    (%%r11)	\n\t"\
		"vmovaps	%%zmm1,0x40(%%rbx)							\n\t	vmovaps	%%zmm9 ,0x40(%%r11)	\n\t"\
		"vmovaps	%%zmm2,    (%%rcx)							\n\t	vmovaps	%%zmm10,    (%%r12)	\n\t"\
		"vmovaps	%%zmm3,0x40(%%rdx)							\n\t	vmovaps	%%zmm11,0x40(%%r13)	\n\t"\
	"vmovaps	%%zmm14,(%%rax) 	\n\t"/* spill zmm14 to make room for 2.0 */"	vmovaps	(%%r15),%%zmm14	\n\t"/* two */\
	"vfmadd132pd	%%zmm14,%%zmm0,%%zmm4						\n\t	vfmadd132pd	%%zmm14,%%zmm8 ,%%zmm12		\n\t"\
	"vfmadd132pd	%%zmm14,%%zmm1,%%zmm5						\n\t	vfmadd132pd	%%zmm14,%%zmm9 ,%%zmm13		\n\t"\
	"vfmadd132pd	%%zmm14,%%zmm2,%%zmm7						\n\t	vfmadd132pd	%%zmm14,%%zmm10,%%zmm15		\n\t"\
	"vfmadd132pd	%%zmm14,%%zmm3,%%zmm6						\n\t	vfmadd132pd	(%%rax),%%zmm11,%%zmm14		\n\t"\
		"vmovaps	%%zmm4,    (%%rax)							\n\t	vmovaps	%%zmm12,    (%%r10)	\n\t"\
		"vmovaps	%%zmm5,0x40(%%rax)							\n\t	vmovaps	%%zmm13,0x40(%%r10)	\n\t"\
		"vmovaps	%%zmm7,    (%%rdx)							\n\t	vmovaps	%%zmm15,    (%%r13)	\n\t"\
		"vmovaps	%%zmm6,0x40(%%rcx)							\n\t	vmovaps	%%zmm14,0x40(%%r12)	\n\t"\
	/* Block 1: SSE2_RADIX4_DIF_IN_PLACE(r3 , r19, r11, r27): */	/* Block 3: SSE2_RADIX4_DIF_IN_PLACE(r7 , r23, r15, r31): */\
		"addq	$%c[__i1],%%rax	\n\t"/* addr += 1*ostride */"	addq	$%c[__i1],%%r10	\n\t"/* addr += 1*ostride */\
		"addq	$%c[__i1],%%rbx	\n\t"/* w.r.to to Block 0 */"	addq	$%c[__i1],%%r12	\n\t"/* w.r.to to Block 2 */\
		"addq	$%c[__i1],%%rcx							\n\t	addq	$%c[__i1],%%r11	\n\t"\
		"addq	$%c[__i1],%%rdx							\n\t	addq	$%c[__i1],%%r13	\n\t"\
		"vmovaps	    (%%rbx),%%zmm0							\n\t	vmovaps	    (%%r11),%%zmm8 	\n\t"\
		"vmovaps	0x40(%%rbx),%%zmm1							\n\t	vmovaps	0x40(%%r11),%%zmm9 	\n\t"\
		"vmovaps	    (%%rax),%%zmm2							\n\t	vmovaps	    (%%r10),%%zmm10	\n\t"\
		"vmovaps	0x40(%%rax),%%zmm3							\n\t	vmovaps	0x40(%%r10),%%zmm11	\n\t"\
		"vmovaps	    (%%rdx),%%zmm4							\n\t	vmovaps	    (%%r13),%%zmm12	\n\t"\
		"vmovaps	0x40(%%rdx),%%zmm5							\n\t	vmovaps	0x40(%%r13),%%zmm13	\n\t"\
		"vmovaps	    (%%rcx),%%zmm6							\n\t	vmovaps	    (%%r12),%%zmm14	\n\t"\
		"vmovaps	0x40(%%rcx),%%zmm7							\n\t	vmovaps	0x40(%%r12),%%zmm15	\n\t"\
		"vsubpd		%%zmm0 ,%%zmm2,%%zmm2						\n\t	vsubpd		%%zmm8 ,%%zmm10,%%zmm10	\n\t"\
		"vsubpd		%%zmm1 ,%%zmm3,%%zmm3						\n\t	vsubpd		%%zmm9 ,%%zmm11,%%zmm11	\n\t"\
		"vsubpd		%%zmm4 ,%%zmm6,%%zmm6						\n\t	vsubpd		%%zmm12,%%zmm14,%%zmm14	\n\t"\
		"vsubpd		%%zmm5 ,%%zmm7,%%zmm7						\n\t	vsubpd		%%zmm13,%%zmm15,%%zmm15	\n\t"\
	"vmovaps	%%zmm13,(%%rax) 	\n\t"/* spill zmm13 to make room for 2.0 */"	vmovaps	(%%r15),%%zmm13	\n\t"/* two */\
	"vfmadd132pd	%%zmm13,%%zmm2,%%zmm0						\n\t	vfmadd132pd	%%zmm13,%%zmm10,%%zmm8 	\n\t"\
	"vfmadd132pd	%%zmm13,%%zmm3,%%zmm1						\n\t	vfmadd132pd	%%zmm13,%%zmm11,%%zmm9 	\n\t"\
	"vfmadd132pd	%%zmm13,%%zmm6,%%zmm4						\n\t	vfmadd132pd	%%zmm13,%%zmm14,%%zmm12	\n\t"\
	"vfmadd132pd	%%zmm13,%%zmm7,%%zmm5						\n\t	vfmadd132pd	(%%rax),%%zmm15,%%zmm13	\n\t"\
		"vsubpd	%%zmm4,%%zmm0,%%zmm0							\n\t	vsubpd	%%zmm12,%%zmm8 ,%%zmm8 		\n\t"\
		"vsubpd	%%zmm5,%%zmm1,%%zmm1							\n\t	vsubpd	%%zmm13,%%zmm9 ,%%zmm9 		\n\t"\
		"vsubpd	%%zmm7,%%zmm2,%%zmm2							\n\t	vsubpd	%%zmm15,%%zmm10,%%zmm10		\n\t"\
		"vsubpd	%%zmm6,%%zmm3,%%zmm3							\n\t	vsubpd	%%zmm14,%%zmm11,%%zmm11		\n\t"\
		"vmovaps	%%zmm0,    (%%rbx)							\n\t	vmovaps	%%zmm8 ,    (%%r11)	\n\t"\
		"vmovaps	%%zmm1,0x40(%%rbx)							\n\t	vmovaps	%%zmm9 ,0x40(%%r11)	\n\t"\
		"vmovaps	%%zmm2,    (%%rcx)							\n\t	vmovaps	%%zmm10,    (%%r12)	\n\t"\
		"vmovaps	%%zmm3,0x40(%%rdx)							\n\t	vmovaps	%%zmm11,0x40(%%r13)	\n\t"\
	"vmovaps	%%zmm14,(%%rax) 	\n\t"/* spill zmm14 to make room for 2.0 */"	vmovaps	(%%r15),%%zmm14	\n\t"/* two */\
	"vfmadd132pd	%%zmm14,%%zmm0,%%zmm4						\n\t	vfmadd132pd	%%zmm14,%%zmm8 ,%%zmm12		\n\t"\
	"vfmadd132pd	%%zmm14,%%zmm1,%%zmm5						\n\t	vfmadd132pd	%%zmm14,%%zmm9 ,%%zmm13		\n\t"\
	"vfmadd132pd	%%zmm14,%%zmm2,%%zmm7						\n\t	vfmadd132pd	%%zmm14,%%zmm10,%%zmm15		\n\t"\
	"vfmadd132pd	%%zmm14,%%zmm3,%%zmm6						\n\t	vfmadd132pd	(%%rax),%%zmm11,%%zmm14		\n\t"\
		"vmovaps	%%zmm4,    (%%rax)							\n\t	vmovaps	%%zmm12,    (%%r10)	\n\t"\
		"vmovaps	%%zmm5,0x40(%%rax)							\n\t	vmovaps	%%zmm13,0x40(%%r10)	\n\t"\
		"vmovaps	%%zmm7,    (%%rdx)							\n\t	vmovaps	%%zmm15,    (%%r13)	\n\t"\
		"vmovaps	%%zmm6,0x40(%%rcx)							\n\t	vmovaps	%%zmm14,0x40(%%r12)	\n\t"\
	/*****************************************************************************************
	**** Now do 4 DFTs with internal twiddles on the 1*stride - separated data. Do blocks ****
	**** in order 0,2,1,3 to allow increment-only of rsi-datum from 1 block to the next:  ****
	*****************************************************************************************/\
	/* Problem: In the sse2 and avx versions of the reduced-#args macro with their single columns of
	instructions, had plenty of GPRs to store both I and O-addresses simultaneously. In the 2-column avx2
	and avx-512 versions, don't have enough GPRs. But since we don't need the O-addresses until we are ready
	to write outputs, just move the O-address computations down to that part of each 4-DFT sub-block: */\
	/* Block 0: r0-3 */												/* Block 1: r8-b */\
		"movq	%[__in0],%%rsi	\n\t	leaq %c[__i4](%%rsi),%%r8 \n\t addq $%c[__i4],%%r8 \n\t"/* __in0+[0,8]*ostride */\
	/* Need separate address for Im parts of outputs due to literal-offsets below: */\
		"leaq	0x40(%%rsi),%%rdi								\n\t	leaq	0x40(%%r8 ),%%r9 	\n\t"\
		"vmovaps	        (%%rsi),%%zmm0						\n\t	vmovaps	        (%%r8 ),%%zmm8 	\n\t"/* ar */\
		"vmovaps	        (%%rdi),%%zmm1						\n\t	vmovaps	        (%%r9 ),%%zmm9 	\n\t"/* ai */\
		"vmovaps	%c[__i2](%%rsi),%%zmm2						\n\t	vmovaps	%c[__i2](%%r8 ),%%zmm10	\n\t"/* br */\
		"vmovaps	%c[__i2](%%rdi),%%zmm3						\n\t	vmovaps	%c[__i2](%%r9 ),%%zmm11	\n\t"/* bi */\
		"vmovaps	%c[__i1](%%rsi),%%zmm4						\n\t	vmovaps	%c[__i1](%%r8 ),%%zmm12	\n\t"/* cr */\
		"vmovaps	%c[__i1](%%rdi),%%zmm5						\n\t	vmovaps	%c[__i1](%%r9 ),%%zmm13	\n\t"/* ci */\
		"vmovaps	%c[__i3](%%rsi),%%zmm6						\n\t	vmovaps	%c[__i3](%%r8 ),%%zmm14	\n\t"/* dr */\
		"vmovaps	%c[__i3](%%rdi),%%zmm7						\n\t	vmovaps	%c[__i3](%%r9 ),%%zmm15	\n\t"/* di */\
		"																movq	%[__isrt2],%%r14	\n\t"\
		"vsubpd		%%zmm2 ,%%zmm0,%%zmm0						\n\t	vsubpd		%%zmm11,%%zmm8 ,%%zmm8 	\n\t"/* ar-bi */\
		"vsubpd		%%zmm3 ,%%zmm1,%%zmm1						\n\t	vsubpd		%%zmm10,%%zmm9 ,%%zmm9 	\n\t"/* ai-br */\
		"vsubpd		%%zmm6 ,%%zmm4,%%zmm4						\n\t	vsubpd		%%zmm13,%%zmm12,%%zmm12	\n\t"/* cr-ci */\
		"vsubpd		%%zmm7 ,%%zmm5,%%zmm5						\n\t	vsubpd		%%zmm14,%%zmm15,%%zmm15	\n\t"/* di-dr */\
	"vmovaps	%%zmm14,(%%rax) 	\n\t"/* spill zmm14 to make room for 2.0 */"	vmovaps	(%%r15),%%zmm14	\n\t"/* two */\
	"vfmadd132pd	%%zmm14,%%zmm0,%%zmm2						\n\t	vfmadd132pd	%%zmm14,%%zmm8 ,%%zmm11	\n\t"/* ar+bi */\
	"vfmadd132pd	%%zmm14,%%zmm1,%%zmm3						\n\t	vfmadd132pd	%%zmm14,%%zmm9 ,%%zmm10	\n\t"/* ai+br */\
	"vfmadd132pd	%%zmm14,%%zmm4,%%zmm6						\n\t	vfmadd132pd	%%zmm14,%%zmm12,%%zmm13	\n\t"/* cr+ci */\
	"vfmadd132pd	%%zmm14,%%zmm5,%%zmm7						\n\t	vfmadd132pd	(%%rax),%%zmm15,%%zmm14	\n\t"/* di+dr */\
		"																	vsubpd	%%zmm14,%%zmm12,%%zmm12		\n\t"\
		"																	vsubpd	%%zmm15,%%zmm13,%%zmm13		\n\t"\
	"movq	%[out0],%%r8	\n\t	movq	%[off],%%r9	\n\t"/* Load output base-address into r8 and offset-array pointer into r9 */\
		"movslq		    (%%r9),%%rax	\n\t"/*        off0 */"movslq	0x10(%%r9),%%r10	\n\t"/*        off4 */\
		"movslq		0x04(%%r9),%%rbx	\n\t"/*        off1 */"movslq	0x14(%%r9),%%r11	\n\t"/*        off5 */\
		"movslq		0x08(%%r9),%%rcx	\n\t"/*        off2 */"movslq	0x18(%%r9),%%r12	\n\t"/*        off6 */\
		"movslq		0x0c(%%r9),%%rdx	\n\t"/*        off3 */"movslq	0x1c(%%r9),%%r13	\n\t"/*        off7 */\
		"leaq	(%%r8,%%rax,8),%%rax	\n\t"/* out0 + off0 */"leaq	(%%r8,%%r10,8),%%r10	\n\t"/* out0 + off4 */\
		"leaq	(%%r8,%%rbx,8),%%rbx	\n\t"/* out0 + off1 */"leaq	(%%r8,%%r11,8),%%r11	\n\t"/* out0 + off5 */\
		"leaq	(%%r8,%%rcx,8),%%rcx	\n\t"/* out0 + off2 */"leaq	(%%r8,%%r12,8),%%r12	\n\t"/* out0 + off6 */\
		"leaq	(%%r8,%%rdx,8),%%rdx	\n\t"/* out0 + off3 */"leaq	(%%r8,%%r13,8),%%r13	\n\t"/* out0 + off7 */\
		"vsubpd		%%zmm6,%%zmm2,%%zmm2						\n\t	vfmadd132pd	(%%r15),%%zmm12,%%zmm14		\n\t"\
		"vsubpd		%%zmm7,%%zmm3,%%zmm3						\n\t	vfmadd132pd	(%%r15),%%zmm13,%%zmm15		\n\t"\
		"vsubpd		%%zmm5,%%zmm0,%%zmm0						\n\t	vfnmadd231pd	(%%r14),%%zmm12,%%zmm8 	\n\t"/* x = x - y.isrt2 */\
		"vsubpd		%%zmm4,%%zmm1,%%zmm1						\n\t	vfnmadd231pd	(%%r14),%%zmm13,%%zmm10	\n\t"\
		"vmovaps	%%zmm2,    (%%rbx)							\n\t	vfnmadd231pd	(%%r14),%%zmm14,%%zmm9 	\n\t"\
		"vmovaps	%%zmm3,0x40(%%rbx)							\n\t	vfnmadd231pd	(%%r14),%%zmm15,%%zmm11	\n\t"\
		"vmovaps	%%zmm0,    (%%rcx)							\n\t	vmovaps	%%zmm8 ,    (%%r11)	\n\t"\
		"vmovaps	%%zmm1,0x40(%%rdx)							\n\t	vmovaps	%%zmm10,0x40(%%r11)	\n\t"\
	"vfmadd132pd	(%%r15),%%zmm2,%%zmm6						\n\t	vmovaps	%%zmm9 ,0x40(%%r13)	\n\t"\
	"vfmadd132pd	(%%r15),%%zmm3,%%zmm7						\n\t	vmovaps	%%zmm11,    (%%r12)	\n\t"\
	"vfmadd132pd	(%%r15),%%zmm0,%%zmm5						\n\t	vfmadd132pd	-0x40(%%r14),%%zmm8 ,%%zmm12	\n\t"/* y = x + y.sqrt2 = x + y.isrt2 */\
	"vfmadd132pd	(%%r15),%%zmm1,%%zmm4						\n\t	vfmadd132pd	-0x40(%%r14),%%zmm10,%%zmm13	\n\t"\
		"vmovaps	%%zmm6,    (%%rax)							\n\t	vfmadd132pd	-0x40(%%r14),%%zmm11,%%zmm15	\n\t"\
		"vmovaps	%%zmm7,0x40(%%rax)							\n\t	vfmadd132pd	-0x40(%%r14),%%zmm9 ,%%zmm14	\n\t"\
		"vmovaps	%%zmm5,    (%%rdx)							\n\t	vmovaps	%%zmm12,    (%%r10)	\n\t"\
		"vmovaps	%%zmm4,0x40(%%rcx)							\n\t	vmovaps	%%zmm13,0x40(%%r10)	\n\t"\
		"																vmovaps	%%zmm15,    (%%r13)	\n\t"\
		"																vmovaps	%%zmm14,0x40(%%r12)	\n\t"\
	/* Block 2: */													/* Block 3: */\
		"addq	$%c[__i4],%%rsi	\n\t	leaq %c[__i4](%%rsi),%%r8 \n\t addq $%c[__i4],%%r8 \n\t"/* __in0+[4,c]*ostride */\
		"leaq	0x40(%%rsi),%%rdi								\n\t	leaq	0x40(%%r8 ),%%r9 	\n\t"\
		"vmovaps	%c[__i1](%%rsi),%%zmm4						\n\t	vmovaps	%c[__i1](%%r8 ),%%zmm12	\n\t"\
		"vmovaps	%c[__i3](%%rsi),%%zmm6						\n\t	vmovaps	%c[__i3](%%r8 ),%%zmm14	\n\t"\
		"vmovaps	%c[__i1](%%rdi),%%zmm5						\n\t	vmovaps	%c[__i1](%%r9 ),%%zmm13	\n\t"\
		"vmovaps	%c[__i3](%%rdi),%%zmm7						\n\t	vmovaps	%c[__i3](%%r9 ),%%zmm15	\n\t"\
		"movq	%[__isrt2],%%rdi	\n\t	addq	$0x40,%%rdi	\n\t"/* cc0, from isrt2 [rdi,rsi shared by both cols] */\
		"vmovaps	%%zmm4,%%zmm0								\n\t	vmovaps	%%zmm12,%%zmm8 		\n\t"\
	/*	"vmovaps	%%zmm6,%%zmm2								\n\t	vmovaps	%%zmm14,%%zmm10		\n\t"*/\
		"vmovaps	(%%rdi),%%zmm2								\n\t	vmovaps	0x40(%%rdi),%%zmm10	\n\t"/* Instead use these to store [c,s] */\
		"vmovaps	%%zmm5,%%zmm1								\n\t	vmovaps	%%zmm13,%%zmm9 		\n\t"\
		"vmovaps	%%zmm7,%%zmm3								\n\t	vmovaps	%%zmm15,%%zmm11		\n\t"\
		"vmulpd		    %%zmm2 ,%%zmm4,%%zmm4					\n\t	vmulpd		    %%zmm10,%%zmm12,%%zmm12	\n\t"\
		"vmulpd		    %%zmm2 ,%%zmm5,%%zmm5					\n\t	vmulpd		    %%zmm10,%%zmm13,%%zmm13	\n\t"\
		"vmulpd		    %%zmm10,%%zmm6,%%zmm6					\n\t	vmulpd		    %%zmm2 ,%%zmm14,%%zmm14	\n\t"\
		"vmulpd		    %%zmm10,%%zmm7,%%zmm7					\n\t	vmulpd		    %%zmm2 ,%%zmm15,%%zmm15	\n\t"\
	"vfnmadd231pd	    %%zmm10,%%zmm1,%%zmm4				\n\t	vfnmadd231pd	    %%zmm2 ,%%zmm9 ,%%zmm12		\n\t"\
	" vfmadd231pd	    %%zmm10,%%zmm0,%%zmm5				\n\t	 vfmadd231pd	    %%zmm2 ,%%zmm8 ,%%zmm13		\n\t"\
	"vfnmadd231pd	    %%zmm2 ,%%zmm3,%%zmm6				\n\t	vfnmadd231pd	    %%zmm10,%%zmm11,%%zmm14		\n\t"\
	" vfmadd231pd %c[__i3](%%rsi),%%zmm2,%%zmm7				\n\t	 vfmadd231pd %c[__i3](%%r8 ),%%zmm10,%%zmm15	\n\t"\
		"vsubpd	%%zmm6,%%zmm4,%%zmm4							\n\t	vsubpd	%%zmm14,%%zmm12,%%zmm12		\n\t"\
		"vsubpd	%%zmm7,%%zmm5,%%zmm5							\n\t	vsubpd	%%zmm15,%%zmm13,%%zmm13		\n\t"\
	"vfmadd132pd	(%%r15),%%zmm4,%%zmm6						\n\t	vfmadd132pd	(%%r15),%%zmm12,%%zmm14		\n\t"\
	"vfmadd132pd	(%%r15),%%zmm5,%%zmm7						\n\t	vfmadd132pd	(%%r15),%%zmm13,%%zmm15		\n\t"\
		"leaq	0x40(%%rsi),%%rdi								\n\t	leaq	0x40(%%r8 ),%%r9 	\n\t"\
		"vmovaps	%c[__i2](%%rsi),%%zmm2						\n\t	vmovaps	%c[__i2](%%r8 ),%%zmm10	\n\t"\
		"vmovaps	%c[__i2](%%rdi),%%zmm3						\n\t	vmovaps	%c[__i2](%%r9 ),%%zmm11	\n\t"\
		"vmovaps	        (%%rsi),%%zmm0						\n\t	vmovaps	        (%%r8 ),%%zmm8 	\n\t"\
		"vmovaps	    0x40(%%rsi),%%zmm1						\n\t	vmovaps	    0x40(%%r8 ),%%zmm9 	\n\t"\
		"vsubpd		  %%zmm3,%%zmm2,%%zmm2						\n\t	vaddpd	%%zmm11,%%zmm10,%%zmm10	\n\t"\
		"vaddpd	%c[__i2](%%rsi),%%zmm3,%%zmm3					\n\t	vsubpd	%c[__i2](%%r8 ),%%zmm11,%%zmm11	\n\t"\
		"movq	%[__isrt2],%%r9 	\n\t"\
	"vfnmadd231pd		 (%%r9),%%zmm2,%%zmm0				\n\t	vfnmadd231pd		 (%%r9),%%zmm10,%%zmm8 	\n\t"/* x = x - y.isrt2 */\
	"vfnmadd231pd		 (%%r9),%%zmm3,%%zmm1				\n\t	vfnmadd231pd		 (%%r9),%%zmm11,%%zmm9 	\n\t"\
	" vfmadd132pd	-0x40(%%r9),%%zmm0,%%zmm2				\n\t	 vfmadd132pd	-0x40(%%r9),%%zmm8 ,%%zmm10	\n\t"/* y = x + y.sqrt2 = x + y.isrt2 */\
	" vfmadd132pd	-0x40(%%r9),%%zmm1,%%zmm3				\n\t	 vfmadd132pd	-0x40(%%r9),%%zmm9 ,%%zmm11	\n\t"\
	"movq	%[out0],%%r8	\n\t	movq	%[off],%%r9	\n\t"/* Load output base-address into r8 and offset-array pointer into r9 */\
		"movslq		0x20(%%r9),%%rax	\n\t"/*        off8 */"movslq	0x30(%%r9),%%r10	\n\t"/*        offc */\
		"movslq		0x24(%%r9),%%rbx	\n\t"/*        off9 */"movslq	0x34(%%r9),%%r11	\n\t"/*        offd */\
		"movslq		0x28(%%r9),%%rcx	\n\t"/*        offa */"movslq	0x38(%%r9),%%r12	\n\t"/*        offe */\
		"movslq		0x2c(%%r9),%%rdx	\n\t"/*        offb */"movslq	0x3c(%%r9),%%r13	\n\t"/*        offf */\
		"leaq	(%%r8,%%rax,8),%%rax	\n\t"/* out0 + off8 */"leaq	(%%r8,%%r10,8),%%r10	\n\t"/* out0 + offc */\
		"leaq	(%%r8,%%rbx,8),%%rbx	\n\t"/* out0 + off9 */"leaq	(%%r8,%%r11,8),%%r11	\n\t"/* out0 + offd */\
		"leaq	(%%r8,%%rcx,8),%%rcx	\n\t"/* out0 + offa */"leaq	(%%r8,%%r12,8),%%r12	\n\t"/* out0 + offe */\
		"leaq	(%%r8,%%rdx,8),%%rdx	\n\t"/* out0 + offb */"leaq	(%%r8,%%r13,8),%%r13	\n\t"/* out0 + offf */\
		"vsubpd	%%zmm6,%%zmm2,%%zmm2							\n\t	vsubpd	%%zmm12,%%zmm8 ,%%zmm8 		\n\t"\
		"vsubpd	%%zmm7,%%zmm3,%%zmm3							\n\t	vsubpd	%%zmm13,%%zmm9 ,%%zmm9 		\n\t"\
		"vsubpd	%%zmm5,%%zmm0,%%zmm0							\n\t	vsubpd	%%zmm15,%%zmm10,%%zmm10	\n\t"\
		"vsubpd	%%zmm4,%%zmm1,%%zmm1							\n\t	vsubpd	%%zmm14,%%zmm11,%%zmm11	\n\t"\
	"vmovaps	%%zmm14,(%%rax) 	\n\t"/* spill zmm14 to make room for 2.0 */"	vmovaps	(%%r15),%%zmm14	\n\t"/* two */\
		"vmovaps	%%zmm2,    (%%rbx)							\n\t	vmovaps	%%zmm8 ,    (%%r11)	\n\t"\
		"vmovaps	%%zmm3,0x40(%%rbx)							\n\t	vmovaps	%%zmm9 ,0x40(%%r11)	\n\t"\
		"vmovaps	%%zmm0,    (%%rcx)							\n\t	vmovaps	%%zmm10,    (%%r12)	\n\t"\
		"vmovaps	%%zmm1,0x40(%%rdx)							\n\t	vmovaps	%%zmm11,0x40(%%r13)	\n\t"\
	"vfmadd132pd	%%zmm14,%%zmm2,%%zmm6						\n\t	vfmadd132pd	%%zmm14,%%zmm8 ,%%zmm12		\n\t"\
	"vfmadd132pd	%%zmm14,%%zmm3,%%zmm7						\n\t	vfmadd132pd	%%zmm14,%%zmm9 ,%%zmm13		\n\t"\
	"vfmadd132pd	%%zmm14,%%zmm0,%%zmm5						\n\t	vfmadd132pd	%%zmm14,%%zmm10,%%zmm15		\n\t"\
	"vfmadd132pd	%%zmm14,%%zmm1,%%zmm4						\n\t	vfmadd132pd	(%%rax),%%zmm11,%%zmm14		\n\t"\
		"vmovaps	%%zmm6,    (%%rax)							\n\t	vmovaps	%%zmm12,    (%%r10)	\n\t"\
		"vmovaps	%%zmm7,0x40(%%rax)							\n\t	vmovaps	%%zmm13,0x40(%%r10)	\n\t"\
		"vmovaps	%%zmm5,    (%%rdx)							\n\t	vmovaps	%%zmm15,    (%%r13)	\n\t"\
		"vmovaps	%%zmm4,0x40(%%rcx)							\n\t	vmovaps	%%zmm14,0x40(%%r12)	\n\t"\
		:					/* outputs: none */\
		:[__in0] "m" (Xin0)	/* All inputs from memory addresses here */\
		,[__i1] "e" (Xi1)\
		,[__i2] "e" (Xi2)\
		,[__i3] "e" (Xi3)\
		,[__i4] "e" (Xi4)\
		,[__isrt2] "m" (Xisrt2)\
		,[__two] "m" (Xtwo)\
		,[out0] "m" (Xout0) /* output-address-octet base pointer */\
		,[off] "m" (Xoff)	/* and pointer to uint32 array of 8 double* index offsets */\
		: "cc","memory","rax","rbx","rcx","rdx","rsi","rdi","r8","r9","r10","r11","r12","r13","r14","r15","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15"	/* Clobbered registers */\
	);\
	}

	// Same as above, but with specifiable I-addresses and regularly spaced O-addresses:
	//
	// Radix-16 DIF DFT, no twiddles, AVX-512 (zmm0-15; one vector-complex datum = Re,Im pair of
	// 64-byte zmm registers, i.e. 0x80 bytes). Structure: 4 side-by-side radix-4 DIF passes over
	// the inputs (two instruction columns, low-half data in zmm0-7, high-half in zmm8-15), then
	// 4 radix-4 DFTs with internal twiddles applied to the stride-4-separated intermediates.
	// Arguments:
	//   Xin0          = input base address; Xi1..Xi4 = literal byte offsets = [1,2,3,4]*istride.
	//   Xisrt2        = address of 1/sqrt2 vector const; per the -0x40/+0x40 offsets used below,
	//                   sqrt2 sits 0x40 bytes below it and the [c,s] cc0-pair 0x40 bytes above it.
	//   Xtwo          = address of 2.0 vector const, kept pointed-to by r15 throughout and used
	//                   for the FMA-based double-and-add/sub idiom: b = 2.a + (b-a) = a+b.
	//   Xout0         = output base address; outputs written at fixed 0x80-byte spacing,
	//                   i.e. out[j] = Xout0 + j*0x80.
	// Clobbers GPRs rax-rdx, rsi, rdi, r8-r13, r15 and zmm0-15; note r14 is deliberately absent
	// from the clobber list because, unlike the offset-array variant above, this macro never uses it.
	//
	#define SSE2_RADIX16_DIF_0TWIDDLE_B(Xin0,Xi1,Xi2,Xi3,Xi4, Xisrt2,Xtwo, Xout0)\
	{\
	__asm__ volatile (\
		"movq	%[__two],%%r15	\n\t"/* two, used for FMA-based double-and-ADD/SUBs */\
	/* Block 0: SSE2_RADIX4_DIF_IN_PLACE(r1 , r17, r9 , r25): */	/* Block 2: SSE2_RADIX4_DIF_IN_PLACE(r5 , r21, r13, r29): */\
	"movq	%[__in0],%%rax	\n\t"/* Note BR of r[abcd]x: b<-->c */	"	leaq	%c[__i2](%%rax),%%r10	\n\t"/* addr += 2*ostride */\
	"leaq	%c[__i4](%%rax),%%rcx	\n\t"/* __in0+  [4*istride] */	"	leaq	%c[__i2](%%rcx),%%r12	\n\t"/* w.r.t. Block 0 */\
	"leaq	%c[__i4](%%rcx),%%rbx	\n\t"/* __in0+2*[4*istride] */	"	leaq	%c[__i2](%%rbx),%%r11	\n\t"/* Note BR of r1[0123]: r11<-->r12 */\
	"leaq	%c[__i4](%%rbx),%%rdx	\n\t"/* __in0+3*[4*istride] */	"	leaq	%c[__i2](%%rdx),%%r13	\n\t"\
		"vmovaps	    (%%rbx),%%zmm0							\n\t	vmovaps	    (%%r11),%%zmm8 	\n\t"\
		"vmovaps	0x40(%%rbx),%%zmm1							\n\t	vmovaps	0x40(%%r11),%%zmm9 	\n\t"\
		"vmovaps	    (%%rax),%%zmm2							\n\t	vmovaps	    (%%r10),%%zmm10	\n\t"\
		"vmovaps	0x40(%%rax),%%zmm3							\n\t	vmovaps	0x40(%%r10),%%zmm11	\n\t"\
		"vmovaps	    (%%rdx),%%zmm4							\n\t	vmovaps	    (%%r13),%%zmm12	\n\t"\
		"vmovaps	0x40(%%rdx),%%zmm5							\n\t	vmovaps	0x40(%%r13),%%zmm13	\n\t"\
		"vmovaps	    (%%rcx),%%zmm6							\n\t	vmovaps	    (%%r12),%%zmm14	\n\t"\
		"vmovaps	0x40(%%rcx),%%zmm7							\n\t	vmovaps	0x40(%%r12),%%zmm15	\n\t"\
		"vsubpd		%%zmm0 ,%%zmm2,%%zmm2						\n\t	vsubpd		%%zmm8 ,%%zmm10,%%zmm10	\n\t"\
		"vsubpd		%%zmm1 ,%%zmm3,%%zmm3						\n\t	vsubpd		%%zmm9 ,%%zmm11,%%zmm11	\n\t"\
		"vsubpd		%%zmm4 ,%%zmm6,%%zmm6						\n\t	vsubpd		%%zmm12,%%zmm14,%%zmm14	\n\t"\
		"vsubpd		%%zmm5 ,%%zmm7,%%zmm7						\n\t	vsubpd		%%zmm13,%%zmm15,%%zmm15	\n\t"\
	"vmovaps	%%zmm13,(%%rax) 	\n\t"/* spill zmm13 to make room for 2.0; the (%%rax) slot is rewritten by a later store, so no data are lost */"	vmovaps	(%%r15),%%zmm13	\n\t"/* two */\
	"vfmadd132pd	%%zmm13,%%zmm2,%%zmm0						\n\t	vfmadd132pd	%%zmm13,%%zmm10,%%zmm8 	\n\t"\
	"vfmadd132pd	%%zmm13,%%zmm3,%%zmm1						\n\t	vfmadd132pd	%%zmm13,%%zmm11,%%zmm9 	\n\t"\
	"vfmadd132pd	%%zmm13,%%zmm6,%%zmm4						\n\t	vfmadd132pd	%%zmm13,%%zmm14,%%zmm12	\n\t"\
	"vfmadd132pd	%%zmm13,%%zmm7,%%zmm5						\n\t	vfmadd132pd	(%%rax),%%zmm15,%%zmm13	\n\t"/* rightmost FMA re-reads the spilled zmm13 value from (%%rax) */\
		"vsubpd	%%zmm4,%%zmm0,%%zmm0							\n\t	vsubpd	%%zmm12,%%zmm8 ,%%zmm8 		\n\t"\
		"vsubpd	%%zmm5,%%zmm1,%%zmm1							\n\t	vsubpd	%%zmm13,%%zmm9 ,%%zmm9 		\n\t"\
		"vsubpd	%%zmm7,%%zmm2,%%zmm2							\n\t	vsubpd	%%zmm15,%%zmm10,%%zmm10		\n\t"\
		"vsubpd	%%zmm6,%%zmm3,%%zmm3							\n\t	vsubpd	%%zmm14,%%zmm11,%%zmm11		\n\t"\
		"vmovaps	%%zmm0,    (%%rbx)							\n\t	vmovaps	%%zmm8 ,    (%%r11)	\n\t"\
		"vmovaps	%%zmm1,0x40(%%rbx)							\n\t	vmovaps	%%zmm9 ,0x40(%%r11)	\n\t"\
		"vmovaps	%%zmm2,    (%%rcx)							\n\t	vmovaps	%%zmm10,    (%%r12)	\n\t"\
		"vmovaps	%%zmm3,0x40(%%rdx)							\n\t	vmovaps	%%zmm11,0x40(%%r13)	\n\t"\
	"vmovaps	%%zmm14,(%%rax) 	\n\t"/* spill zmm14 to make room for 2.0; slot rewritten by the zmm4 store just below */"	vmovaps	(%%r15),%%zmm14	\n\t"/* two */\
	"vfmadd132pd	%%zmm14,%%zmm0,%%zmm4						\n\t	vfmadd132pd	%%zmm14,%%zmm8 ,%%zmm12		\n\t"\
	"vfmadd132pd	%%zmm14,%%zmm1,%%zmm5						\n\t	vfmadd132pd	%%zmm14,%%zmm9 ,%%zmm13		\n\t"\
	"vfmadd132pd	%%zmm14,%%zmm2,%%zmm7						\n\t	vfmadd132pd	%%zmm14,%%zmm10,%%zmm15		\n\t"\
	"vfmadd132pd	%%zmm14,%%zmm3,%%zmm6						\n\t	vfmadd132pd	(%%rax),%%zmm11,%%zmm14		\n\t"/* re-read spilled zmm14 from (%%rax) */\
		"vmovaps	%%zmm4,    (%%rax)							\n\t	vmovaps	%%zmm12,    (%%r10)	\n\t"\
		"vmovaps	%%zmm5,0x40(%%rax)							\n\t	vmovaps	%%zmm13,0x40(%%r10)	\n\t"\
		"vmovaps	%%zmm7,    (%%rdx)							\n\t	vmovaps	%%zmm15,    (%%r13)	\n\t"\
		"vmovaps	%%zmm6,0x40(%%rcx)							\n\t	vmovaps	%%zmm14,0x40(%%r12)	\n\t"\
	/* Block 1: SSE2_RADIX4_DIF_IN_PLACE(r3 , r19, r11, r27): */	/* Block 3: SSE2_RADIX4_DIF_IN_PLACE(r7 , r23, r15, r31): */\
		"leaq	%c[__i1](%%rax),%%rax	\n\t"/* addr += 1*ostride */"	leaq	%c[__i2](%%rax),%%r10	\n\t"/* addr += 2*ostride */\
		"leaq	%c[__i1](%%rbx),%%rbx	\n\t"/* w.r.t. Block 0 */"	leaq	%c[__i2](%%rbx),%%r11	\n\t"/* w.r.t. Block 1 */\
		"leaq	%c[__i1](%%rcx),%%rcx							\n\t	leaq	%c[__i2](%%rcx),%%r12	\n\t"\
		"leaq	%c[__i1](%%rdx),%%rdx							\n\t	leaq	%c[__i2](%%rdx),%%r13	\n\t"\
		"vmovaps	    (%%rbx),%%zmm0							\n\t	vmovaps	    (%%r11),%%zmm8 	\n\t"\
		"vmovaps	0x40(%%rbx),%%zmm1							\n\t	vmovaps	0x40(%%r11),%%zmm9 	\n\t"\
		"vmovaps	    (%%rax),%%zmm2							\n\t	vmovaps	    (%%r10),%%zmm10	\n\t"\
		"vmovaps	0x40(%%rax),%%zmm3							\n\t	vmovaps	0x40(%%r10),%%zmm11	\n\t"\
		"vmovaps	    (%%rdx),%%zmm4							\n\t	vmovaps	    (%%r13),%%zmm12	\n\t"\
		"vmovaps	0x40(%%rdx),%%zmm5							\n\t	vmovaps	0x40(%%r13),%%zmm13	\n\t"\
		"vmovaps	    (%%rcx),%%zmm6							\n\t	vmovaps	    (%%r12),%%zmm14	\n\t"\
		"vmovaps	0x40(%%rcx),%%zmm7							\n\t	vmovaps	0x40(%%r12),%%zmm15	\n\t"\
		"vsubpd		%%zmm0 ,%%zmm2,%%zmm2						\n\t	vsubpd		%%zmm8 ,%%zmm10,%%zmm10	\n\t"\
		"vsubpd		%%zmm1 ,%%zmm3,%%zmm3						\n\t	vsubpd		%%zmm9 ,%%zmm11,%%zmm11	\n\t"\
		"vsubpd		%%zmm4 ,%%zmm6,%%zmm6						\n\t	vsubpd		%%zmm12,%%zmm14,%%zmm14	\n\t"\
		"vsubpd		%%zmm5 ,%%zmm7,%%zmm7						\n\t	vsubpd		%%zmm13,%%zmm15,%%zmm15	\n\t"\
	"vmovaps	%%zmm13,(%%rax) 	\n\t"/* spill zmm13 to make room for 2.0; slot rewritten by the zmm4 store below */"	vmovaps	(%%r15),%%zmm13	\n\t"/* two */\
	"vfmadd132pd	%%zmm13,%%zmm2,%%zmm0						\n\t	vfmadd132pd	%%zmm13,%%zmm10,%%zmm8 	\n\t"\
	"vfmadd132pd	%%zmm13,%%zmm3,%%zmm1						\n\t	vfmadd132pd	%%zmm13,%%zmm11,%%zmm9 	\n\t"\
	"vfmadd132pd	%%zmm13,%%zmm6,%%zmm4						\n\t	vfmadd132pd	%%zmm13,%%zmm14,%%zmm12	\n\t"\
	"vfmadd132pd	%%zmm13,%%zmm7,%%zmm5						\n\t	vfmadd132pd	(%%rax),%%zmm15,%%zmm13	\n\t"/* re-read spilled zmm13 from (%%rax) */\
		"vsubpd	%%zmm4,%%zmm0,%%zmm0							\n\t	vsubpd	%%zmm12,%%zmm8 ,%%zmm8 		\n\t"\
		"vsubpd	%%zmm5,%%zmm1,%%zmm1							\n\t	vsubpd	%%zmm13,%%zmm9 ,%%zmm9 		\n\t"\
		"vsubpd	%%zmm7,%%zmm2,%%zmm2							\n\t	vsubpd	%%zmm15,%%zmm10,%%zmm10		\n\t"\
		"vsubpd	%%zmm6,%%zmm3,%%zmm3							\n\t	vsubpd	%%zmm14,%%zmm11,%%zmm11		\n\t"\
		"vmovaps	%%zmm0,    (%%rbx)							\n\t	vmovaps	%%zmm8 ,    (%%r11)	\n\t"\
		"vmovaps	%%zmm1,0x40(%%rbx)							\n\t	vmovaps	%%zmm9 ,0x40(%%r11)	\n\t"\
		"vmovaps	%%zmm2,    (%%rcx)							\n\t	vmovaps	%%zmm10,    (%%r12)	\n\t"\
		"vmovaps	%%zmm3,0x40(%%rdx)							\n\t	vmovaps	%%zmm11,0x40(%%r13)	\n\t"\
	"vmovaps	%%zmm14,(%%rax) 	\n\t"/* spill zmm14 to make room for 2.0; slot rewritten by the zmm4 store below */"	vmovaps	(%%r15),%%zmm14	\n\t"/* two */\
	"vfmadd132pd	%%zmm14,%%zmm0,%%zmm4						\n\t	vfmadd132pd	%%zmm14,%%zmm8 ,%%zmm12		\n\t"\
	"vfmadd132pd	%%zmm14,%%zmm1,%%zmm5						\n\t	vfmadd132pd	%%zmm14,%%zmm9 ,%%zmm13		\n\t"\
	"vfmadd132pd	%%zmm14,%%zmm2,%%zmm7						\n\t	vfmadd132pd	%%zmm14,%%zmm10,%%zmm15		\n\t"\
	"vfmadd132pd	%%zmm14,%%zmm3,%%zmm6						\n\t	vfmadd132pd	(%%rax),%%zmm11,%%zmm14		\n\t"/* re-read spilled zmm14 from (%%rax) */\
		"vmovaps	%%zmm4,    (%%rax)							\n\t	vmovaps	%%zmm12,    (%%r10)	\n\t"\
		"vmovaps	%%zmm5,0x40(%%rax)							\n\t	vmovaps	%%zmm13,0x40(%%r10)	\n\t"\
		"vmovaps	%%zmm7,    (%%rdx)							\n\t	vmovaps	%%zmm15,    (%%r13)	\n\t"\
		"vmovaps	%%zmm6,0x40(%%rcx)							\n\t	vmovaps	%%zmm14,0x40(%%r12)	\n\t"\
	/******************************************************************************/\
	/*** Now do 4 DFTs with internal twiddles on the 4*stride - separated data. ***/\
	/*** Order 0,2,1,3 allows incr-only of rsi-datum from 1 block to the next: ****/\
	/******************************************************************************/\
	/* Block 0: r0-3 */												/* Block 1: r8-b */\
		"movq	%[__in0],%%rsi	\n\t	leaq %c[__i4](%%rsi),%%r8 \n\t leaq %c[__i4](%%r8 ),%%r8 \n\t"/* __in0+8*ostride */\
		"movq	%[__out0],%%rax									\n\t	leaq	0x200(%%rax),%%r10		\n\t"/* out4 */\
		"leaq	0x080(%%rax),%%rbx		/* out1 */				\n\t	leaq	0x080(%%r10),%%r11		\n\t"/* out5 */\
		"leaq	0x100(%%rax),%%rcx		/* out2 */				\n\t	leaq	0x100(%%r10),%%r12		\n\t"/* out6 */\
		"leaq	0x180(%%rax),%%rdx		/* out3 */				\n\t	leaq	0x180(%%r10),%%r13		\n\t"/* out7 */\
	/* Need separate address for Im parts of outputs due to literal-offsets below: */\
		"leaq	0x40(%%rsi),%%rdi								\n\t	leaq	0x40(%%r8 ),%%r9 	\n\t"\
		"vmovaps	        (%%rsi),%%zmm0						\n\t	vmovaps	        (%%r8 ),%%zmm8 	\n\t"/* ar */\
		"vmovaps	        (%%rdi),%%zmm1						\n\t	vmovaps	        (%%r9 ),%%zmm9 	\n\t"/* ai */\
		"vmovaps	%c[__i2](%%rsi),%%zmm2						\n\t	vmovaps	%c[__i2](%%r8 ),%%zmm10	\n\t"/* br */\
		"vmovaps	%c[__i2](%%rdi),%%zmm3						\n\t	vmovaps	%c[__i2](%%r9 ),%%zmm11	\n\t"/* bi */\
		"vmovaps	%c[__i1](%%rsi),%%zmm4						\n\t	vmovaps	%c[__i1](%%r8 ),%%zmm12	\n\t"/* cr */\
		"vmovaps	%c[__i1](%%rdi),%%zmm5						\n\t	vmovaps	%c[__i1](%%r9 ),%%zmm13	\n\t"/* ci */\
		"vmovaps	%c[__i3](%%rsi),%%zmm6						\n\t	vmovaps	%c[__i3](%%r8 ),%%zmm14	\n\t"/* dr */\
		"vmovaps	%c[__i3](%%rdi),%%zmm7						\n\t	vmovaps	%c[__i3](%%r9 ),%%zmm15	\n\t"/* di */\
		"																movq	%[__isrt2],%%r9 	\n\t"\
		"vsubpd		%%zmm2 ,%%zmm0,%%zmm0						\n\t	vsubpd		%%zmm11,%%zmm8 ,%%zmm8 	\n\t"/* ar-bi */\
		"vsubpd		%%zmm3 ,%%zmm1,%%zmm1						\n\t	vsubpd		%%zmm10,%%zmm9 ,%%zmm9 	\n\t"/* ai-br */\
		"vsubpd		%%zmm6 ,%%zmm4,%%zmm4						\n\t	vsubpd		%%zmm13,%%zmm12,%%zmm12	\n\t"/* cr-ci */\
		"vsubpd		%%zmm7 ,%%zmm5,%%zmm5						\n\t	vsubpd		%%zmm14,%%zmm15,%%zmm15	\n\t"/* di-dr */\
	"vmovaps	%%zmm14,(%%rax) 	\n\t"/* spill zmm14 to make room for 2.0; (%%rax) = out0, rewritten by the zmm6 store below */"	vmovaps	(%%r15),%%zmm14	\n\t"/* two */\
	"vfmadd132pd	%%zmm14,%%zmm0,%%zmm2						\n\t	vfmadd132pd	%%zmm14,%%zmm8 ,%%zmm11	\n\t"/* ar+bi */\
	"vfmadd132pd	%%zmm14,%%zmm1,%%zmm3						\n\t	vfmadd132pd	%%zmm14,%%zmm9 ,%%zmm10	\n\t"/* ai+br */\
	"vfmadd132pd	%%zmm14,%%zmm4,%%zmm6						\n\t	vfmadd132pd	%%zmm14,%%zmm12,%%zmm13	\n\t"/* cr+ci */\
	"vfmadd132pd	%%zmm14,%%zmm5,%%zmm7						\n\t	vfmadd132pd	(%%rax),%%zmm15,%%zmm14	\n\t"/* di+dr */\
		"																	vsubpd	%%zmm14,%%zmm12,%%zmm12		\n\t"\
		"																	vsubpd	%%zmm15,%%zmm13,%%zmm13		\n\t"\
		"vsubpd		%%zmm6,%%zmm2,%%zmm2						\n\t	vfmadd132pd	(%%r15),%%zmm12,%%zmm14		\n\t"\
		"vsubpd		%%zmm7,%%zmm3,%%zmm3						\n\t	vfmadd132pd	(%%r15),%%zmm13,%%zmm15		\n\t"\
		"vsubpd		%%zmm5,%%zmm0,%%zmm0						\n\t	vfnmadd231pd	(%%r9 ),%%zmm12,%%zmm8 	\n\t"/* x = x - y.isrt2 */\
		"vsubpd		%%zmm4,%%zmm1,%%zmm1						\n\t	vfnmadd231pd	(%%r9 ),%%zmm13,%%zmm10	\n\t"\
		"vmovaps	%%zmm2,    (%%rbx)							\n\t	vfnmadd231pd	(%%r9 ),%%zmm14,%%zmm9 	\n\t"\
		"vmovaps	%%zmm3,0x40(%%rbx)							\n\t	vfnmadd231pd	(%%r9 ),%%zmm15,%%zmm11	\n\t"\
		"vmovaps	%%zmm0,    (%%rcx)							\n\t	vmovaps	%%zmm8 ,    (%%r11)	\n\t"\
		"vmovaps	%%zmm1,0x40(%%rdx)							\n\t	vmovaps	%%zmm10,0x40(%%r11)	\n\t"\
	"vfmadd132pd	(%%r15),%%zmm2,%%zmm6						\n\t	vmovaps	%%zmm9 ,0x40(%%r13)	\n\t"\
	"vfmadd132pd	(%%r15),%%zmm3,%%zmm7						\n\t	vmovaps	%%zmm11,    (%%r12)	\n\t"\
	"vfmadd132pd	(%%r15),%%zmm0,%%zmm5						\n\t	vfmadd132pd	-0x40(%%r9),%%zmm8 ,%%zmm12	\n\t"/* y = x + y.sqrt2 = x + y.isrt2 */\
	"vfmadd132pd	(%%r15),%%zmm1,%%zmm4						\n\t	vfmadd132pd	-0x40(%%r9),%%zmm10,%%zmm13	\n\t"\
		"vmovaps	%%zmm6,    (%%rax)							\n\t	vfmadd132pd	-0x40(%%r9),%%zmm11,%%zmm15	\n\t"\
		"vmovaps	%%zmm7,0x40(%%rax)							\n\t	vfmadd132pd	-0x40(%%r9),%%zmm9 ,%%zmm14	\n\t"\
		"vmovaps	%%zmm5,    (%%rdx)							\n\t	vmovaps	%%zmm12,    (%%r10)	\n\t"\
		"vmovaps	%%zmm4,0x40(%%rcx)							\n\t	vmovaps	%%zmm13,0x40(%%r10)	\n\t"\
		"																vmovaps	%%zmm15,    (%%r13)	\n\t"\
		"																vmovaps	%%zmm14,0x40(%%r12)	\n\t"\
	/* Block 2: */													/* Block 3: */\
		"leaq	%c[__i4](%%rsi),%%rsi	\n\t"/* __in0+4*ostride */	"	leaq	%c[__i4](%%r8 ),%%r8 	\n\t"/* __in0+c*ostride */\
		"leaq	0x40(%%rsi),%%rdi								\n\t	leaq	0x40(%%r8 ),%%r9 	\n\t"\
		"addq	$0x400,%%rax		/* out8 */					\n\t	addq	$0x400,%%r10		\n\t"/* outc */\
		"addq	$0x400,%%rbx		/* out9 */					\n\t	addq	$0x400,%%r11		\n\t"/* outd */\
		"addq	$0x400,%%rcx		/* outa */					\n\t	addq	$0x400,%%r12		\n\t"/* oute */\
		"addq	$0x400,%%rdx		/* outb */					\n\t	addq	$0x400,%%r13		\n\t"/* outf */\
		"vmovaps	%c[__i1](%%rsi),%%zmm4						\n\t	vmovaps	%c[__i1](%%r8 ),%%zmm12	\n\t"\
		"vmovaps	%c[__i3](%%rsi),%%zmm6						\n\t	vmovaps	%c[__i3](%%r8 ),%%zmm14	\n\t"\
		"vmovaps	%c[__i1](%%rdi),%%zmm5						\n\t	vmovaps	%c[__i1](%%r9 ),%%zmm13	\n\t"\
		"vmovaps	%c[__i3](%%rdi),%%zmm7						\n\t	vmovaps	%c[__i3](%%r9 ),%%zmm15	\n\t"\
		"movq	%[__isrt2],%%rdi	\n\t	addq	$0x40,%%rdi	\n\t"/* cc0, from isrt2 [rdi,rsi shared by both cols] */\
		"vmovaps	%%zmm4,%%zmm0								\n\t	vmovaps	%%zmm12,%%zmm8 		\n\t"\
	/*	"vmovaps	%%zmm6,%%zmm2								\n\t	vmovaps	%%zmm14,%%zmm10		\n\t"*/\
		"vmovaps	(%%rdi),%%zmm2								\n\t	vmovaps	0x40(%%rdi),%%zmm10	\n\t"/* Instead use these to store [c,s] */\
		"vmovaps	%%zmm5,%%zmm1								\n\t	vmovaps	%%zmm13,%%zmm9 		\n\t"\
		"vmovaps	%%zmm7,%%zmm3								\n\t	vmovaps	%%zmm15,%%zmm11		\n\t"\
		"vmulpd		    %%zmm2 ,%%zmm4,%%zmm4					\n\t	vmulpd		    %%zmm10,%%zmm12,%%zmm12	\n\t"\
		"vmulpd		    %%zmm2 ,%%zmm5,%%zmm5					\n\t	vmulpd		    %%zmm10,%%zmm13,%%zmm13	\n\t"\
		"vmulpd		    %%zmm10,%%zmm6,%%zmm6					\n\t	vmulpd		    %%zmm2 ,%%zmm14,%%zmm14	\n\t"\
		"vmulpd		    %%zmm10,%%zmm7,%%zmm7					\n\t	vmulpd		    %%zmm2 ,%%zmm15,%%zmm15	\n\t"\
	"vfnmadd231pd	    %%zmm10,%%zmm1,%%zmm4				\n\t	vfnmadd231pd	    %%zmm2 ,%%zmm9 ,%%zmm12		\n\t"\
	" vfmadd231pd	    %%zmm10,%%zmm0,%%zmm5				\n\t	 vfmadd231pd	    %%zmm2 ,%%zmm8 ,%%zmm13		\n\t"\
	"vfnmadd231pd	    %%zmm2 ,%%zmm3,%%zmm6				\n\t	vfnmadd231pd	    %%zmm10,%%zmm11,%%zmm14		\n\t"\
	" vfmadd231pd %c[__i3](%%rsi),%%zmm2,%%zmm7				\n\t	 vfmadd231pd %c[__i3](%%r8 ),%%zmm10,%%zmm15	\n\t"/* dr still in memory, re-read as FMA mem-operand */\
		"vsubpd	%%zmm6,%%zmm4,%%zmm4							\n\t	vsubpd	%%zmm14,%%zmm12,%%zmm12		\n\t"\
		"vsubpd	%%zmm7,%%zmm5,%%zmm5							\n\t	vsubpd	%%zmm15,%%zmm13,%%zmm13		\n\t"\
	"vfmadd132pd	(%%r15),%%zmm4,%%zmm6						\n\t	vfmadd132pd	(%%r15),%%zmm12,%%zmm14		\n\t"\
	"vfmadd132pd	(%%r15),%%zmm5,%%zmm7						\n\t	vfmadd132pd	(%%r15),%%zmm13,%%zmm15		\n\t"\
		"leaq	0x40(%%rsi),%%rdi								\n\t	leaq	0x40(%%r8 ),%%r9 	\n\t"\
		"vmovaps	%c[__i2](%%rsi),%%zmm2						\n\t	vmovaps	%c[__i2](%%r8 ),%%zmm10	\n\t"\
		"vmovaps	%c[__i2](%%rdi),%%zmm3						\n\t	vmovaps	%c[__i2](%%r9 ),%%zmm11	\n\t"\
		"vmovaps	        (%%rsi),%%zmm0						\n\t	vmovaps	        (%%r8 ),%%zmm8 	\n\t"\
		"vmovaps	    0x40(%%rsi),%%zmm1						\n\t	vmovaps	    0x40(%%r8 ),%%zmm9 	\n\t"\
		"vsubpd		  %%zmm3,%%zmm2,%%zmm2						\n\t	vaddpd	%%zmm11,%%zmm10,%%zmm10	\n\t"\
		"vaddpd	%c[__i2](%%rsi),%%zmm3,%%zmm3					\n\t	vsubpd	%c[__i2](%%r8 ),%%zmm11,%%zmm11	\n\t"\
		"movq	%[__isrt2],%%r9 	\n\t"\
	"vfnmadd231pd		 (%%r9),%%zmm2,%%zmm0				\n\t	vfnmadd231pd		 (%%r9),%%zmm10,%%zmm8 	\n\t"/* x = x - y.isrt2 */\
	"vfnmadd231pd		 (%%r9),%%zmm3,%%zmm1				\n\t	vfnmadd231pd		 (%%r9),%%zmm11,%%zmm9 	\n\t"\
	" vfmadd132pd	-0x40(%%r9),%%zmm0,%%zmm2				\n\t	 vfmadd132pd	-0x40(%%r9),%%zmm8 ,%%zmm10	\n\t"/* y = x + y.sqrt2 = x + y.isrt2 */\
	" vfmadd132pd	-0x40(%%r9),%%zmm1,%%zmm3				\n\t	 vfmadd132pd	-0x40(%%r9),%%zmm9 ,%%zmm11	\n\t"\
		"vsubpd	%%zmm6,%%zmm2,%%zmm2							\n\t	vsubpd	%%zmm12,%%zmm8 ,%%zmm8 		\n\t"\
		"vsubpd	%%zmm7,%%zmm3,%%zmm3							\n\t	vsubpd	%%zmm13,%%zmm9 ,%%zmm9 		\n\t"\
		"vsubpd	%%zmm5,%%zmm0,%%zmm0							\n\t	vsubpd	%%zmm15,%%zmm10,%%zmm10	\n\t"\
		"vsubpd	%%zmm4,%%zmm1,%%zmm1							\n\t	vsubpd	%%zmm14,%%zmm11,%%zmm11	\n\t"\
	"vmovaps	%%zmm14,(%%rax) 	\n\t"/* spill zmm14 to make room for 2.0; (%%rax) = out8, rewritten by the zmm6 store below */"	vmovaps	(%%r15),%%zmm14	\n\t"/* two */\
		"vmovaps	%%zmm2,    (%%rbx)							\n\t	vmovaps	%%zmm8 ,    (%%r11)	\n\t"\
		"vmovaps	%%zmm3,0x40(%%rbx)							\n\t	vmovaps	%%zmm9 ,0x40(%%r11)	\n\t"\
		"vmovaps	%%zmm0,    (%%rcx)							\n\t	vmovaps	%%zmm10,    (%%r12)	\n\t"\
		"vmovaps	%%zmm1,0x40(%%rdx)							\n\t	vmovaps	%%zmm11,0x40(%%r13)	\n\t"\
	"vfmadd132pd	%%zmm14,%%zmm2,%%zmm6						\n\t	vfmadd132pd	%%zmm14,%%zmm8 ,%%zmm12		\n\t"\
	"vfmadd132pd	%%zmm14,%%zmm3,%%zmm7						\n\t	vfmadd132pd	%%zmm14,%%zmm9 ,%%zmm13		\n\t"\
	"vfmadd132pd	%%zmm14,%%zmm0,%%zmm5						\n\t	vfmadd132pd	%%zmm14,%%zmm10,%%zmm15		\n\t"\
	"vfmadd132pd	%%zmm14,%%zmm1,%%zmm4						\n\t	vfmadd132pd	(%%rax),%%zmm11,%%zmm14		\n\t"/* re-read spilled zmm14 from (%%rax) */\
		"vmovaps	%%zmm6,    (%%rax)							\n\t	vmovaps	%%zmm12,    (%%r10)	\n\t"\
		"vmovaps	%%zmm7,0x40(%%rax)							\n\t	vmovaps	%%zmm13,0x40(%%r10)	\n\t"\
		"vmovaps	%%zmm5,    (%%rdx)							\n\t	vmovaps	%%zmm15,    (%%r13)	\n\t"\
		"vmovaps	%%zmm4,0x40(%%rcx)							\n\t	vmovaps	%%zmm14,0x40(%%r12)	\n\t"\
		:					/* outputs: none */\
		:[__in0] "m" (Xin0)	/* All inputs from memory addresses here */\
		,[__i1] "e" (Xi1)\
		,[__i2] "e" (Xi2)\
		,[__i3] "e" (Xi3)\
		,[__i4] "e" (Xi4)\
		,[__isrt2] "m" (Xisrt2)\
		,[__two] "m" (Xtwo)\
		,[__out0] "m" (Xout0)\
		: "cc","memory","rax","rbx","rcx","rdx","rsi","rdi","r8","r9","r10","r11","r12","r13","r15","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15"	/* Clobbered registers: r14 intentionally omitted - unused here */\
	);\
	}

	// Based on the SSE2_RADIX16_DIT_NOTWIDDLE macro in radix16_ditN_cy_dif1_gcc64.h, but with completely
	// specifiable 16-input addressing required for usage as the power-of-2 component of a twiddleless
	// radix = [odd*2^n] DFT routine.
	// We use just a single output base-pointer plus literal ostrides which are [1,2,3,4]-multiples of
	// __01; this allows us to cut GP-register usage, which is absolutely a must for the 32-bit version
	// of the macro, and is a benefit to the 64-bit versions which code-fold to yield 2 side-by-side
	// streams of independently executable instructions, one for data in xmm0-7, the other using xmm8-15.
	//
	#define SSE2_RADIX16_DIT_0TWIDDLE(Xin0,Xoff, Xisrt2,Xtwo, Xout0,Xo1,Xo2,Xo3,Xo4)\
	{\
	__asm__ volatile (\
		"movq	%[__two],%%r15	\n\t	vmovaps	(%%r15),%%zmm31	\n\t"/* two */\
	"movq	%[in0],%%r14	\n\t	movq	%[off],%%r15	\n\t"/* Load input base-address into r14 and int32[16] offset-array pointer into r15 */\
		"movslq		    (%%r15),%%rax	\n\t	movslq		0x20(%%r15),%%r10	\n\t"/* off[0-3],[8-b] */\
		"movslq		0x04(%%r15),%%rbx	\n\t	movslq		0x24(%%r15),%%r11	\n\t"\
		"movslq		0x08(%%r15),%%rcx	\n\t	movslq		0x28(%%r15),%%r12	\n\t"\
		"movslq		0x0c(%%r15),%%rdx	\n\t	movslq		0x2c(%%r15),%%r13	\n\t"\
		"leaq	(%%r14,%%rax,8),%%rax	\n\t	leaq	(%%r14,%%r10,8),%%r10	\n\t"/* in0 + off[0-3],[8-b] */\
		"leaq	(%%r14,%%rbx,8),%%rbx	\n\t	leaq	(%%r14,%%r11,8),%%r11	\n\t"\
		"leaq	(%%r14,%%rcx,8),%%rcx	\n\t	leaq	(%%r14,%%r12,8),%%r12	\n\t"\
		"leaq	(%%r14,%%rdx,8),%%rdx	\n\t	leaq	(%%r14,%%r13,8),%%r13	\n\t"\
	"prefetcht1	0x200(%%rax)	\n\t"\
		/* Need separate address Im parts of outputs due to literal-offsets below */\
		"movq	%[__out0],%%rsi									\n\t	leaq	%c[__o4](%%rsi),%%r8 	\n\t"\
		"leaq	0x40(%%rsi),%%rdi								\n\t	addq	$%c[__o4],%%r8	 	\n\t"/* out0+8*ostride */\
		"																leaq	0x40(%%r8 ),%%r9 	\n\t"\
		/* SSE2_RADIX4_DIT_0TWIDDLE_B(r0 ): */							/* SSE2_RADIX4_DIT_0TWIDDLE_B(r16): */\
		"vmovaps	    (%%rax),%%zmm2							\n\t	vmovaps	    (%%r10),%%zmm10	\n\t"\
		"vmovaps	    (%%rcx),%%zmm6							\n\t	vmovaps	    (%%r12),%%zmm14	\n\t"\
		"vmovaps	0x40(%%rax),%%zmm3							\n\t	vmovaps	0x40(%%r10),%%zmm11	\n\t"\
		"vmovaps	0x40(%%rcx),%%zmm7							\n\t	vmovaps	0x40(%%r12),%%zmm15	\n\t"\
		"vmovaps	    (%%rbx),%%zmm0							\n\t	vmovaps	    (%%r11),%%zmm8 	\n\t"\
		"vmovaps	    (%%rdx),%%zmm4							\n\t	vmovaps	    (%%r13),%%zmm12	\n\t"\
		"vmovaps	0x40(%%rbx),%%zmm1							\n\t	vmovaps	0x40(%%r11),%%zmm9 	\n\t"\
		"vmovaps	0x40(%%rdx),%%zmm5							\n\t	vmovaps	0x40(%%r13),%%zmm13	\n\t"\
		"vsubpd	%%zmm0,%%zmm2,%%zmm2							\n\t	vsubpd	%%zmm8 ,%%zmm10,%%zmm10		\n\t"\
		"vsubpd	%%zmm4,%%zmm6,%%zmm6							\n\t	vsubpd	%%zmm12,%%zmm14,%%zmm14		\n\t"\
		"vsubpd	%%zmm1,%%zmm3,%%zmm3							\n\t	vsubpd	%%zmm9 ,%%zmm11,%%zmm11		\n\t"\
		"vsubpd	%%zmm5,%%zmm7,%%zmm7							\n\t	vsubpd	%%zmm13,%%zmm15,%%zmm15		\n\t"\
		/* Remember, v8_double(2.0) in zmm31: */\
	"vfmadd132pd	%%zmm31,%%zmm2,%%zmm0						\n\t	vfmadd132pd	%%zmm31,%%zmm10,%%zmm8 		\n\t"\
	"vfmadd132pd	%%zmm31,%%zmm6,%%zmm4						\n\t	vfmadd132pd	%%zmm31,%%zmm14,%%zmm12		\n\t"\
	"vfmadd132pd	%%zmm31,%%zmm3,%%zmm1						\n\t	vfmadd132pd	%%zmm31,%%zmm11,%%zmm9 		\n\t"\
	"vfmadd132pd	%%zmm31,%%zmm7,%%zmm5						\n\t	vfmadd132pd	%%zmm31,%%zmm15,%%zmm13		\n\t"\
		"vsubpd	%%zmm4,%%zmm0,%%zmm0							\n\t	vsubpd	%%zmm12,%%zmm8 ,%%zmm8 		\n\t"\
		"vsubpd	%%zmm7,%%zmm2,%%zmm2							\n\t	vsubpd	%%zmm15,%%zmm10,%%zmm10		\n\t"\
		"vsubpd	%%zmm5,%%zmm1,%%zmm1							\n\t	vsubpd	%%zmm13,%%zmm9 ,%%zmm9 		\n\t"\
		"vsubpd	%%zmm6,%%zmm3,%%zmm3							\n\t	vsubpd	%%zmm14,%%zmm11,%%zmm11		\n\t"\
	"prefetcht1	0x200(%%rcx)								\n\t	prefetcht1	0x200(%%r12)	\n\t"\
		"vmovaps	%%zmm0,%c[__o2](%%rsi)						\n\t	vmovaps	%%zmm8 ,%c[__o2](%%r8 )	\n\t"\
		"vmovaps	%%zmm2,%c[__o3](%%rsi)						\n\t	vmovaps	%%zmm10,%c[__o3](%%r8 )	\n\t"\
		"vmovaps	%%zmm1,%c[__o2](%%rdi)						\n\t	vmovaps	%%zmm9 ,%c[__o2](%%r9 )	\n\t"\
		"vmovaps	%%zmm3,%c[__o1](%%rdi)						\n\t	vmovaps	%%zmm11,%c[__o1](%%r9 )	\n\t"\
	"vfmadd132pd	%%zmm31,%%zmm0,%%zmm4						\n\t	vfmadd132pd	%%zmm31,%%zmm8 ,%%zmm12		\n\t"\
	"vfmadd132pd	%%zmm31,%%zmm2,%%zmm7						\n\t	vfmadd132pd	%%zmm31,%%zmm10,%%zmm15		\n\t"\
	"vfmadd132pd	%%zmm31,%%zmm1,%%zmm5						\n\t	vfmadd132pd	%%zmm31,%%zmm9 ,%%zmm13		\n\t"\
	"vfmadd132pd	%%zmm31,%%zmm3,%%zmm6						\n\t	vfmadd132pd	%%zmm31,%%zmm11,%%zmm14		\n\t"\
		"vmovaps	%%zmm4,        (%%rsi)						\n\t	vmovaps	%%zmm12,        (%%r8 )	\n\t"\
		"vmovaps	%%zmm7,%c[__o1](%%rsi)						\n\t	vmovaps	%%zmm15,%c[__o1](%%r8 )	\n\t"\
		"vmovaps	%%zmm5,        (%%rdi)						\n\t	vmovaps	%%zmm13,        (%%r9 )	\n\t"\
		"vmovaps	%%zmm6,%c[__o3](%%rdi)						\n\t	vmovaps	%%zmm14,%c[__o3](%%r9 )	\n\t"\
	"prefetcht1	0x200(%%rax)								\n\t	prefetcht1	0x200(%%r10)\n\t"\
		/* SSE2_RADIX4_DIT_0TWIDDLE_B(r8 ): */						/* SSE2_RADIX4_DIT_0TWIDDLE_B(r24): */\
		"movslq		0x10(%%r15),%%rax	\n\t	movslq		0x30(%%r15),%%r10	\n\t"/* off[4-7],[c-f] */\
		"movslq		0x14(%%r15),%%rbx	\n\t	movslq		0x34(%%r15),%%r11	\n\t"\
		"movslq		0x18(%%r15),%%rcx	\n\t	movslq		0x38(%%r15),%%r12	\n\t"\
		"movslq		0x1c(%%r15),%%rdx	\n\t	movslq		0x3c(%%r15),%%r13	\n\t"\
		"leaq	(%%r14,%%rax,8),%%rax	\n\t	leaq	(%%r14,%%r10,8),%%r10	\n\t"/* in0 + off[4-7],[c-f] */\
		"leaq	(%%r14,%%rbx,8),%%rbx	\n\t	leaq	(%%r14,%%r11,8),%%r11	\n\t"\
		"leaq	(%%r14,%%rcx,8),%%rcx	\n\t	leaq	(%%r14,%%r12,8),%%r12	\n\t"\
		"leaq	(%%r14,%%rdx,8),%%rdx	\n\t	leaq	(%%r14,%%r13,8),%%r13	\n\t"\
		"addq	$%c[__o4],%%rsi			\n\t	addq	$%c[__o4],%%r8	 	\n\t"/* out0+[4,c]*ostride */\
		"vmovaps	    (%%rax),%%zmm2							\n\t	vmovaps	    (%%r10),%%zmm10	\n\t"\
		"vmovaps	    (%%rcx),%%zmm6							\n\t	vmovaps	    (%%r12),%%zmm14	\n\t"\
		"vmovaps	0x40(%%rax),%%zmm3							\n\t	vmovaps	0x40(%%r10),%%zmm11	\n\t"\
		"vmovaps	0x40(%%rcx),%%zmm7							\n\t	vmovaps	0x40(%%r12),%%zmm15	\n\t"\
		"vmovaps	    (%%rbx),%%zmm0							\n\t	vmovaps	    (%%r11),%%zmm8 	\n\t"\
		"vmovaps	    (%%rdx),%%zmm4							\n\t	vmovaps	    (%%r13),%%zmm12	\n\t"\
		"vmovaps	0x40(%%rbx),%%zmm1							\n\t	vmovaps	0x40(%%r11),%%zmm9 	\n\t"\
		"vmovaps	0x40(%%rdx),%%zmm5							\n\t	vmovaps	0x40(%%r13),%%zmm13	\n\t"\
		"vsubpd	%%zmm0,%%zmm2,%%zmm2							\n\t	vsubpd	%%zmm8 ,%%zmm10,%%zmm10		\n\t"\
		"vsubpd	%%zmm4,%%zmm6,%%zmm6							\n\t	vsubpd	%%zmm12,%%zmm14,%%zmm14		\n\t"\
		"vsubpd	%%zmm1,%%zmm3,%%zmm3							\n\t	vsubpd	%%zmm9 ,%%zmm11,%%zmm11		\n\t"\
		"vsubpd	%%zmm5,%%zmm7,%%zmm7							\n\t	vsubpd	%%zmm13,%%zmm15,%%zmm15		\n\t"\
	"vfmadd132pd	%%zmm31,%%zmm2,%%zmm0						\n\t	vfmadd132pd	%%zmm31,%%zmm10,%%zmm8 		\n\t"\
	"vfmadd132pd	%%zmm31,%%zmm6,%%zmm4						\n\t	vfmadd132pd	%%zmm31,%%zmm14,%%zmm12		\n\t"\
	"vfmadd132pd	%%zmm31,%%zmm3,%%zmm1						\n\t	vfmadd132pd	%%zmm31,%%zmm11,%%zmm9 		\n\t"\
	"vfmadd132pd	%%zmm31,%%zmm7,%%zmm5						\n\t	vfmadd132pd	%%zmm31,%%zmm15,%%zmm13		\n\t"\
		"vsubpd	%%zmm4,%%zmm0,%%zmm0							\n\t	vsubpd	%%zmm12,%%zmm8 ,%%zmm8 		\n\t"\
		"vsubpd	%%zmm7,%%zmm2,%%zmm2							\n\t	vsubpd	%%zmm15,%%zmm10,%%zmm10		\n\t"\
		"vsubpd	%%zmm5,%%zmm1,%%zmm1							\n\t	vsubpd	%%zmm13,%%zmm9 ,%%zmm9 		\n\t"\
		"vsubpd	%%zmm6,%%zmm3,%%zmm3							\n\t	vsubpd	%%zmm14,%%zmm11,%%zmm11		\n\t"\
	"prefetcht1	0x200(%%rcx)								\n\t	prefetcht1	0x200(%%r12)	\n\t"\
		"leaq	0x40(%%rsi),%%rdi								\n\t	leaq	0x40(%%r8 ),%%r9 	\n\t"\
		"vmovaps	%%zmm0,%c[__o2](%%rsi)						\n\t	vmovaps	%%zmm8 ,%c[__o2](%%r8 )	\n\t"\
		"vmovaps	%%zmm2,%c[__o3](%%rsi)						\n\t	vmovaps	%%zmm10,%c[__o3](%%r8 )	\n\t"\
		"vmovaps	%%zmm1,%c[__o2](%%rdi)						\n\t	vmovaps	%%zmm9 ,%c[__o2](%%r9 )	\n\t"\
		"vmovaps	%%zmm3,%c[__o1](%%rdi)						\n\t	vmovaps	%%zmm11,%c[__o1](%%r9 )	\n\t"\
	"vfmadd132pd	%%zmm31,%%zmm0,%%zmm4						\n\t	vfmadd132pd	%%zmm31,%%zmm8 ,%%zmm12		\n\t"\
	"vfmadd132pd	%%zmm31,%%zmm2,%%zmm7						\n\t	vfmadd132pd	%%zmm31,%%zmm10,%%zmm15		\n\t"\
	"vfmadd132pd	%%zmm31,%%zmm1,%%zmm5						\n\t	vfmadd132pd	%%zmm31,%%zmm9 ,%%zmm13		\n\t"\
	"vfmadd132pd	%%zmm31,%%zmm3,%%zmm6						\n\t	vfmadd132pd	%%zmm31,%%zmm11,%%zmm14		\n\t"\
		"vmovaps	%%zmm4,        (%%rsi)						\n\t	vmovaps	%%zmm12,        (%%r8 )	\n\t"\
		"vmovaps	%%zmm7,%c[__o1](%%rsi)						\n\t	vmovaps	%%zmm15,%c[__o1](%%r8 )	\n\t"\
		"vmovaps	%%zmm5,        (%%rdi)						\n\t	vmovaps	%%zmm13,        (%%r9 )	\n\t"\
		"vmovaps	%%zmm6,%c[__o3](%%rdi)						\n\t	vmovaps	%%zmm14,%c[__o3](%%r9 )	\n\t"\
	"prefetcht1	0x200(%%rax)	\n\t"\
	/******************************************************************************/\
	/*** Now do 4 DFTs with internal twiddles on the 4*stride - separated data: ***/\
	/******************************************************************************/\
		/* From here onward, no longer need r14,r15 for I-addressing */\
		/* Block 0: */												/* Block 2: */\
		"movq	%[__out0],%%rax									\n\t	leaq	%c[__o2](%%rax),%%r10	\n\t"/* All addresses += 2*ostride */\
	"leaq %c[__o4](%%rax),%%rbx \n\t"/* out0+  [4*ostride] */"	\n\t	leaq	%c[__o2](%%rbx),%%r11	\n\t"\
	"leaq %c[__o4](%%rbx),%%rcx \n\t"/* out0+2*[4*ostride] */"	\n\t	leaq	%c[__o2](%%rcx),%%r12	\n\t"\
	"leaq %c[__o4](%%rcx),%%rdx \n\t"/* out0+3*[4*ostride] */"	\n\t	leaq	%c[__o2](%%rdx),%%r13	\n\t"\
		"movq	%[__isrt2],%%rdi	\n\t"/* Stick isrt2 and sqrt2 into zmm29.30,resp: */\
		"vmovaps	(%%rdi),%%zmm29	\n\t	vmovaps	-0x40(%%rdi),%%zmm30 	\n\t"\
		"vmovaps	    (%%rax),%%zmm0							\n\t	vmovaps	    (%%r10),%%zmm8 	\n\t"/* ar */\
		"vmovaps	0x40(%%rax),%%zmm1							\n\t	vmovaps	0x40(%%r10),%%zmm9 	\n\t"/* ai */\
		"vmovaps	    (%%rbx),%%zmm2							\n\t	vmovaps	    (%%r11),%%zmm10	\n\t"/* br */\
		"vmovaps	0x40(%%rbx),%%zmm3							\n\t	vmovaps	0x40(%%r11),%%zmm11	\n\t"/* bi */\
		"vmovaps	    (%%rcx),%%zmm4							\n\t	vmovaps	    (%%r12),%%zmm12	\n\t"/* cr */\
		"vmovaps	0x40(%%rcx),%%zmm5							\n\t	vmovaps	0x40(%%r12),%%zmm13	\n\t"/* ci */\
		"vmovaps	    (%%rdx),%%zmm6							\n\t	vmovaps	    (%%r13),%%zmm14	\n\t"/* dr */\
		"vmovaps	0x40(%%rdx),%%zmm7							\n\t	vmovaps	0x40(%%r13),%%zmm15	\n\t"/* di */\
		"vsubpd	    %%zmm2 ,%%zmm0,%%zmm0						\n\t	vsubpd	    %%zmm11,%%zmm8 ,%%zmm8 	\n\t"/* ar-bi */\
		"vsubpd	    %%zmm3 ,%%zmm1,%%zmm1						\n\t	vsubpd	    %%zmm10,%%zmm9 ,%%zmm9 	\n\t"/* ai-br */\
		"vsubpd	    %%zmm6 ,%%zmm4,%%zmm4						\n\t	vsubpd	    %%zmm12,%%zmm13,%%zmm13	\n\t"/* ci-cr */\
		"vsubpd	    %%zmm7 ,%%zmm5,%%zmm5						\n\t	vsubpd	    %%zmm15,%%zmm14,%%zmm14	\n\t"/* dr-di */\
		"vfmadd132pd	%%zmm31,%%zmm0,%%zmm2					\n\t	vfmadd132pd	%%zmm31,%%zmm8 ,%%zmm11	\n\t"/* ar+bi */\
		"vfmadd132pd	%%zmm31,%%zmm1,%%zmm3					\n\t	vfmadd132pd	%%zmm31,%%zmm9 ,%%zmm10	\n\t"/* ai+br */\
		"vfmadd132pd	%%zmm31,%%zmm4,%%zmm6					\n\t	vfmadd132pd	%%zmm31,%%zmm13,%%zmm12	\n\t"/* ci+cr */\
		"vfmadd132pd	%%zmm31,%%zmm5,%%zmm7					\n\t	vfmadd132pd	%%zmm31,%%zmm14,%%zmm15	\n\t"/* dr+di */\
		"														\n\t		vsubpd	%%zmm14,%%zmm12,%%zmm12		\n\t"\
		"														\n\t		vsubpd	%%zmm15,%%zmm13,%%zmm13		\n\t"\
		"vsubpd	%%zmm5,%%zmm0,%%zmm0							\n\t	vfmadd132pd	%%zmm31,%%zmm12,%%zmm14		\n\t"\
		"vsubpd	%%zmm4,%%zmm1,%%zmm1							\n\t	vfmadd132pd	%%zmm31,%%zmm13,%%zmm15		\n\t"\
		"vsubpd	%%zmm6,%%zmm2,%%zmm2							\n\t	vfnmadd231pd	%%zmm29,%%zmm12,%%zmm11		\n\t"/* x = x - y.isrt2 */\
		"vsubpd	%%zmm7,%%zmm3,%%zmm3							\n\t	vfnmadd231pd	%%zmm29,%%zmm13,%%zmm9 		\n\t"\
		"vmovaps	%%zmm0,    (%%rdx)							\n\t	vfnmadd231pd	%%zmm29,%%zmm14,%%zmm10		\n\t"\
		"vmovaps	%%zmm1,0x40(%%rbx)							\n\t	vfnmadd231pd	%%zmm29,%%zmm15,%%zmm8 		\n\t"\
		"vmovaps	%%zmm2,    (%%rcx)							\n\t		vmovaps	%%zmm11,    (%%r12)	\n\t"\
		"vmovaps	%%zmm3,0x40(%%rcx)							\n\t		vmovaps	%%zmm9 ,0x40(%%r12)	\n\t"\
	"vfmadd132pd	%%zmm31,%%zmm0,%%zmm5						\n\t		vmovaps	%%zmm10,0x40(%%r11)	\n\t"\
	"vfmadd132pd	%%zmm31,%%zmm1,%%zmm4						\n\t		vmovaps	%%zmm8 ,    (%%r13)	\n\t"\
	"vfmadd132pd	%%zmm31,%%zmm2,%%zmm6						\n\t	vfmadd132pd	%%zmm30,%%zmm11,%%zmm12	\n\t"/* y = x + y.sqrt2 = x + y.isrt2 */\
	"vfmadd132pd	%%zmm31,%%zmm3,%%zmm7						\n\t	vfmadd132pd	%%zmm30,%%zmm9 ,%%zmm13	\n\t"\
		"vmovaps	%%zmm5,    (%%rbx)							\n\t	vfmadd132pd	%%zmm30,%%zmm10,%%zmm14	\n\t"\
		"vmovaps	%%zmm4,0x40(%%rdx)							\n\t	vfmadd132pd	%%zmm30,%%zmm8 ,%%zmm15	\n\t"\
		"vmovaps	%%zmm6,    (%%rax)							\n\t		vmovaps	%%zmm12,    (%%r10)	\n\t"\
		"vmovaps	%%zmm7,0x40(%%rax)							\n\t		vmovaps	%%zmm13,0x40(%%r10)	\n\t"\
		"														\n\t		vmovaps	%%zmm14,0x40(%%r13)	\n\t"\
		"														\n\t		vmovaps	%%zmm15,    (%%r11)	\n\t"\
		/* Block 1: */												/* Block 3: */\
	"addq $%c[__o1],%%rax\n\t"/* addr += 1*ostride */"	\n\t	leaq	%c[__o2](%%rax),%%r10	\n\t"/* All addresses += 1*ostride */\
	"addq $%c[__o1],%%rbx\n\t"/* relative to Block 0 */"\n\t	leaq	%c[__o2](%%rbx),%%r11	\n\t"/* relative to Block 1 */\
	"addq $%c[__o1],%%rcx								\n\t	leaq	%c[__o2](%%rcx),%%r12	\n\t"\
	"addq $%c[__o1],%%rdx								\n\t	leaq	%c[__o2](%%rdx),%%r13	\n\t"\
		"leaq	0x40(%%rdi),%%rsi	\n\t"/* cc0, from isrt2 [rdi,rsi shared by both cols] */\
		"vmovaps	    (%%rdx),%%zmm0							\n\t	vmovaps	    (%%r13),%%zmm8 	\n\t"\
		"vmovaps	0x40(%%rdx),%%zmm1							\n\t	vmovaps	0x40(%%r13),%%zmm9 	\n\t"\
		"vmovaps	    (%%rcx),%%zmm4							\n\t	vmovaps	    (%%r12),%%zmm12	\n\t"\
		"vmovaps	0x40(%%rcx),%%zmm5							\n\t	vmovaps	0x40(%%r12),%%zmm13	\n\t"\
		"vmovaps	    %%zmm0 ,%%zmm2							\n\t	vmovaps	    %%zmm8 ,%%zmm10	\n\t"\
		"vmovaps	    %%zmm1 ,%%zmm3							\n\t	vmovaps	    %%zmm9 ,%%zmm11	\n\t"\
		"vmovaps	    %%zmm4 ,%%zmm6							\n\t	vmovaps	    %%zmm12,%%zmm14	\n\t"\
	/*	"vmovaps	    %%zmm5 ,%%zmm7							\n\t	vmovaps	    %%zmm13,%%zmm15	\n\t"*/\
		"vmovaps	0x40(%%rsi),%%zmm7							\n\t	vmovaps	    (%%rsi),%%zmm15	\n\t"/* Instead use these to store [c,s] */\
		"vmulpd		    %%zmm7 ,%%zmm0,%%zmm0					\n\t	vmulpd		    %%zmm15,%%zmm8 ,%%zmm8 	\n\t"\
		"vmulpd		    %%zmm7 ,%%zmm1,%%zmm1					\n\t	vmulpd		    %%zmm15,%%zmm9 ,%%zmm9 	\n\t"\
		"vmulpd		    %%zmm15,%%zmm4,%%zmm4					\n\t	vmulpd		    %%zmm7 ,%%zmm12,%%zmm12	\n\t"\
		"vmulpd		    %%zmm15,%%zmm5,%%zmm5					\n\t	vmulpd		    %%zmm7 ,%%zmm13,%%zmm13	\n\t"\
	"vfnmadd231pd	    %%zmm15,%%zmm2,%%zmm1				\n\t	vfnmadd231pd	    %%zmm7 ,%%zmm10,%%zmm9 	\n\t"\
	" vfmadd231pd	    %%zmm15,%%zmm3,%%zmm0				\n\t	 vfmadd231pd	    %%zmm7 ,%%zmm11,%%zmm8 	\n\t"\
	"vfnmadd231pd	    %%zmm7 ,%%zmm6,%%zmm5				\n\t	vfnmadd231pd	    %%zmm15,%%zmm14,%%zmm13	\n\t"\
	" vfmadd231pd	0x40(%%rcx),%%zmm7,%%zmm4				\n\t	 vfmadd231pd	0x40(%%r12),%%zmm15,%%zmm12	\n\t"\
		"vmovaps	%%zmm5,%%zmm7								\n\t	vmovaps	%%zmm13,%%zmm15		\n\t"\
		"vmovaps	%%zmm4,%%zmm6								\n\t	vmovaps	%%zmm12,%%zmm14		\n\t"\
		"vaddpd	%%zmm0,%%zmm4,%%zmm4							\n\t	vaddpd	%%zmm8 ,%%zmm12,%%zmm12		\n\t"\
		"vaddpd	%%zmm1,%%zmm5,%%zmm5							\n\t	vaddpd	%%zmm9 ,%%zmm13,%%zmm13		\n\t"\
		"vsubpd	%%zmm0,%%zmm6,%%zmm6							\n\t	vsubpd	%%zmm8 ,%%zmm14,%%zmm14		\n\t"\
		"vsubpd	%%zmm1,%%zmm7,%%zmm7							\n\t	vsubpd	%%zmm9 ,%%zmm15,%%zmm15		\n\t"\
		"vmovaps	    (%%rbx),%%zmm2							\n\t	vmovaps	    (%%r11),%%zmm10	\n\t"\
		"vmovaps	0x40(%%rbx),%%zmm3							\n\t	vmovaps	0x40(%%r11),%%zmm11	\n\t"\
		"vmovaps	    (%%rax),%%zmm0							\n\t	vmovaps	    (%%r10),%%zmm8 	\n\t"\
		"vmovaps	0x40(%%rax),%%zmm1							\n\t	vmovaps	0x40(%%r10),%%zmm9 	\n\t"\
		"vaddpd	0x40(%%rbx),%%zmm2,%%zmm2						\n\t	vsubpd	0x40(%%r11),%%zmm10,%%zmm10	\n\t"\
		"vsubpd	    (%%rbx),%%zmm3,%%zmm3						\n\t	vaddpd	    (%%r11),%%zmm11,%%zmm11	\n\t"\
	"vfnmadd231pd	%%zmm29,%%zmm2,%%zmm0						\n\t	vfnmadd231pd	%%zmm29,%%zmm10,%%zmm8 	\n\t"/* x = x - y.isrt2 */\
	"vfnmadd231pd	%%zmm29,%%zmm3,%%zmm1						\n\t	vfnmadd231pd	%%zmm29,%%zmm11,%%zmm9 	\n\t"\
	" vfmadd132pd	%%zmm30,%%zmm0,%%zmm2						\n\t	 vfmadd132pd	%%zmm30,%%zmm8 ,%%zmm10	\n\t"/* y = x + y.sqrt2 = x + y.isrt2 */\
	" vfmadd132pd	%%zmm30,%%zmm1,%%zmm3						\n\t	 vfmadd132pd	%%zmm30,%%zmm9 ,%%zmm11	\n\t"\
		"vsubpd	%%zmm7,%%zmm0,%%zmm0							\n\t	vsubpd	%%zmm13,%%zmm10,%%zmm10		\n\t"\
		"vsubpd	%%zmm6,%%zmm1,%%zmm1							\n\t	vsubpd	%%zmm12,%%zmm11,%%zmm11		\n\t"\
		"vsubpd	%%zmm4,%%zmm2,%%zmm2							\n\t	vsubpd	%%zmm14,%%zmm8 ,%%zmm8 		\n\t"\
		"vsubpd	%%zmm5,%%zmm3,%%zmm3							\n\t	vsubpd	%%zmm15,%%zmm9 ,%%zmm9 		\n\t"\
		"vmovaps	%%zmm0,    (%%rdx)							\n\t	vmovaps	%%zmm10,    (%%r13)	\n\t"\
		"vmovaps	%%zmm1,0x40(%%rbx)							\n\t	vmovaps	%%zmm11,0x40(%%r11)	\n\t"\
		"vmovaps	%%zmm2,    (%%rcx)							\n\t	vmovaps	%%zmm8 ,    (%%r12)	\n\t"\
		"vmovaps	%%zmm3,0x40(%%rcx)							\n\t	vmovaps	%%zmm9 ,0x40(%%r12)	\n\t"\
	"vfmadd132pd	%%zmm31,%%zmm0,%%zmm7						\n\t	vfmadd132pd	%%zmm31,%%zmm10,%%zmm13		\n\t"\
	"vfmadd132pd	%%zmm31,%%zmm1,%%zmm6						\n\t	vfmadd132pd	%%zmm31,%%zmm11,%%zmm12		\n\t"\
	"vfmadd132pd	%%zmm31,%%zmm2,%%zmm4						\n\t	vfmadd132pd	%%zmm31,%%zmm8 ,%%zmm14		\n\t"\
	"vfmadd132pd	%%zmm31,%%zmm3,%%zmm5						\n\t	vfmadd132pd	%%zmm31,%%zmm9 ,%%zmm15		\n\t"\
		"vmovaps	%%zmm7,    (%%rbx)							\n\t	vmovaps	%%zmm13,    (%%r11)	\n\t"\
		"vmovaps	%%zmm6,0x40(%%rdx)							\n\t	vmovaps	%%zmm12,0x40(%%r13)	\n\t"\
		"vmovaps	%%zmm4,    (%%rax)							\n\t	vmovaps	%%zmm14,    (%%r10)	\n\t"\
		"vmovaps	%%zmm5,0x40(%%rax)							\n\t	vmovaps	%%zmm15,0x40(%%r10)	\n\t"\
		:					/* outputs: none */\
		:[in0] "m" (Xin0)	/* Input-address-16-tet base pointer */\
		,[off] "m" (Xoff)	/* and pointer to uint32 array of 16 double* index offsets */\
		,[__isrt2] "m" (Xisrt2)\
		,[__two] "m" (Xtwo)\
		,[__out0] "m" (Xout0)\
		,[__o1] "e" (Xo1)\
		,[__o2] "e" (Xo2)\
		,[__o3] "e" (Xo3)\
		,[__o4] "e" (Xo4)\
		: "cc","memory","rax","rbx","rcx","rdx","rsi","rdi","r8","r9","r10","r11","r12","r13","r14","r15","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15","xmm29","xmm30","xmm31"	/* Clobbered registers */\
	);\
	}

	/* With-twiddles out-of-place analog of above twiddleless DIT macro: 15 nontrivial complex input twiddles E1-f [E0 assumed = 1].
	The DIT version of this macro processes the twiddles in-order.
	NOTE: SINCE THIS MACRO IS SPECIFICALLY DESIGNED AS THE 2ND-PASS OF LARGE-POWER-OF-2-TWIDDLELESS DFT SYNTHESIS, THE
	"TWIDDLES" HERE ARE PURELY OF THE DFT-INTERNAL VARIETY, AND THUS APPLIED TO THE INPUTS, JUST AS FOR THE ABOVE DIF COUNTERPART.

	Sincos layout - Two portions:
	[NOTE: bytewise offsets below are w.r.to SSE2 version of code; AVX doubles these]

	Radix-16 shared consts anchored at isrt2:

	  isrt2 + 0x000;	cc0 + 0x010;	ss0 + 0x020;

	Per-block-specific set of 15 complex twiddles anchored at c1:

		c1  + 0x000;	s1  + 0x010;
		c2  + 0x020;	s2  + 0x030;
		c3  + 0x040;	s3  + 0x050;
		c4  + 0x060;	s4  + 0x070;
		c5  + 0x080;	s5  + 0x090;
		c6  + 0x0a0;	s6  + 0x0b0;
		c7  + 0x0c0;	s7  + 0x0d0;
		c8  + 0x0e0;	s8  + 0x0f0;
		c9  + 0x100;	s9  + 0x110;
		c10 + 0x120;	s10 + 0x130;
		c11 + 0x140;	s11 + 0x150;
		c12 + 0x160;	s12 + 0x170;
		c13 + 0x180;	s13 + 0x190;
		c14 + 0x1a0;	s14 + 0x1b0;
		c15 + 0x1c0;	s15 + 0x1d0;

	Use radix-16 DIF as template for DIT/OOP here, since need a pre-twiddles algorithm:
	*/
	#define SSE2_RADIX16_DIT_TWIDDLE_OOP(Xin0,Xi1,Xi2,Xi3,Xi4, Xout0,Xo1,Xo2,Xo3,Xo4, Xisrt2,Xc1)\
	{\
	__asm__ volatile (\
	/*...Block 0: Do in-place, i.e. outputs into __in0 + [0,1,2,3]*istride: */\
		"movq	%[__in0],%%rax		\n\t"\
		"leaq	%c[__i1](%%rax),%%rcx	\n\t"/* __in0 +   istride */\
		"leaq	%c[__i2](%%rax),%%rbx	\n\t"/* __in0 + 2*istride */\
		"leaq	%c[__i3](%%rax),%%rdx	\n\t"/* __in0 + 3*istride */\
		"/* Do	the p0,1 combo: */	\n\t"\
		"movq	%[__c1],%%rsi 	/* c1 */\n\t"\
		"vmovaps	    (%%rax),%%zmm0	\n\t"\
		"vmovaps	    (%%rcx),%%zmm4	\n\t"\
		"vmovaps	0x40(%%rax),%%zmm1	\n\t"\
		"vmovaps	0x40(%%rcx),%%zmm5	\n\t"\
		"vmovaps	%%zmm4,%%zmm6		\n\t"\
		"vmovaps	%%zmm5,%%zmm7		\n\t"\
		"vmulpd	    (%%rsi),%%zmm4,%%zmm4	\n\t"\
		"vmulpd	    (%%rsi),%%zmm5,%%zmm5	\n\t"\
		"vmulpd	0x40(%%rsi),%%zmm6,%%zmm6	\n\t"\
		"vmovaps	%%zmm0,%%zmm2		\n\t"\
		"vmulpd	0x40(%%rsi),%%zmm7,%%zmm7	\n\t"\
		"vsubpd	%%zmm6,%%zmm5,%%zmm5		\n\t"\
		"vmovaps	%%zmm1,%%zmm3		\n\t"\
		"vaddpd	%%zmm7,%%zmm4,%%zmm4		\n\t"\
		"vaddpd	%%zmm4,%%zmm0,%%zmm0		\n\t"\
		"vaddpd	%%zmm5,%%zmm1,%%zmm1		\n\t"\
		"vsubpd	%%zmm4,%%zmm2,%%zmm2		\n\t"\
		"vsubpd	%%zmm5,%%zmm3,%%zmm3		\n\t"\
		"/* Do	the p2,3 combo: */	\n\t"\
		"addq	$0x80,%%rsi 	/* c2,3 */\n\t"\
		"vmovaps	    (%%rdx),%%zmm4	\n\t"\
		"vmovaps	0x40(%%rdx),%%zmm5	\n\t"\
		"vmovaps	    (%%rdx),%%zmm6	\n\t"\
		"vmovaps	0x40(%%rdx),%%zmm7	\n\t"\
		"vmulpd	0x80(%%rsi),%%zmm4,%%zmm4	/* c3 */\n\t"\
		"vmulpd	0x80(%%rsi),%%zmm5,%%zmm5	\n\t"\
		"vmulpd	0xc0(%%rsi),%%zmm6,%%zmm6	\n\t"\
		"vmulpd	0xc0(%%rsi),%%zmm7,%%zmm7	\n\t"\
		"vsubpd	%%zmm6,%%zmm5,%%zmm5		\n\t"\
		"vaddpd	%%zmm7,%%zmm4,%%zmm4		\n\t"\
		"vmovaps	%%zmm5,0x40(%%rax)	\n\t"\
		"vmovaps	%%zmm4,    (%%rax)	\n\t"\
		"vmovaps	    (%%rbx),%%zmm4	\n\t"\
		"vmovaps	0x40(%%rbx),%%zmm5	\n\t"\
		"vmovaps	    (%%rbx),%%zmm6	\n\t"\
		"vmovaps	0x40(%%rbx),%%zmm7	\n\t"\
		"vmulpd	    (%%rsi),%%zmm4,%%zmm4	/* c2 */\n\t"\
		"vmulpd	    (%%rsi),%%zmm5,%%zmm5	\n\t"\
		"vmulpd	0x40(%%rsi),%%zmm6,%%zmm6	\n\t"\
		"vmulpd	0x40(%%rsi),%%zmm7,%%zmm7	\n\t"\
		"vsubpd	%%zmm6,%%zmm5,%%zmm5		\n\t"\
		"vaddpd	%%zmm7,%%zmm4,%%zmm4		\n\t"\
		"vmovaps	%%zmm5,%%zmm7		\n\t"\
		"vmovaps	%%zmm4,%%zmm6		\n\t"\
		"vsubpd	    (%%rax),%%zmm4,%%zmm4	\n\t"\
		"vsubpd	0x40(%%rax),%%zmm5,%%zmm5	\n\t"\
		"vaddpd	    (%%rax),%%zmm6,%%zmm6	\n\t"\
		"vaddpd	0x40(%%rax),%%zmm7,%%zmm7	\n\t"\
		"/* Finish radix-4 bfly, store results: */\n\t"\
		"vsubpd	%%zmm6,%%zmm0,%%zmm0		\n\t"\
		"vsubpd	%%zmm5,%%zmm2,%%zmm2		\n\t"\
		"vsubpd	%%zmm7,%%zmm1,%%zmm1		\n\t"\
		"vsubpd	%%zmm4,%%zmm3,%%zmm3		\n\t"\
		/* DIT has outputs (indexed in real-temp form as 0-7) 2/6,3/7 swapped, i.e. swap oregs c/d vs DIF: */\
		"vmovaps	%%zmm0,    (%%rbx)	\n\t"\
		"vmovaps	%%zmm2,    (%%rdx)	\n\t"\
		"vmovaps	%%zmm1,0x40(%%rbx)	\n\t"\
		"vmovaps	%%zmm3,0x40(%%rcx)	\n\t"\
		"vaddpd	%%zmm6,%%zmm6,%%zmm6		\n\t"\
		"vaddpd	%%zmm5,%%zmm5,%%zmm5		\n\t"\
		"vaddpd	%%zmm7,%%zmm7,%%zmm7		\n\t"\
		"vaddpd	%%zmm4,%%zmm4,%%zmm4		\n\t"\
		"vaddpd	%%zmm0,%%zmm6,%%zmm6		\n\t"\
		"vaddpd	%%zmm2,%%zmm5,%%zmm5		\n\t"\
		"vaddpd	%%zmm1,%%zmm7,%%zmm7		\n\t"\
		"vaddpd	%%zmm3,%%zmm4,%%zmm4		\n\t"\
		"vmovaps	%%zmm6,    (%%rax)	\n\t"\
		"vmovaps	%%zmm5,    (%%rcx)	\n\t"\
		"vmovaps	%%zmm7,0x40(%%rax)	\n\t"\
		"vmovaps	%%zmm4,0x40(%%rdx)	\n\t"\
		"\n\t"\
	/*...Block 1: outputs into __in0 + [4,5,6,7]*istride: */\
		"addq	$%c[__i4],%%rax	\n\t"/* __in0 + 4*istride */\
		"addq	$%c[__i4],%%rcx	\n\t"/* __in0 + 5*istride */\
		"addq	$%c[__i4],%%rbx	\n\t"/* __in0 + 6*istride */\
		"addq	$%c[__i4],%%rdx	\n\t"/* __in0 + 7*istride */\
		"/* Do	the p0,1 combo: */	\n\t"\
		"addq	$0x100,%%rsi 	/* c4,5 */\n\t"\
		"vmovaps	    (%%rax),%%zmm0	\n\t"\
		"vmovaps	    (%%rcx),%%zmm4	\n\t"\
		"vmovaps	0x40(%%rax),%%zmm1	\n\t"\
		"vmovaps	0x40(%%rcx),%%zmm5	\n\t"\
		"vmovaps	    (%%rsi),%%zmm6	\n\t"\
		"vmovaps	0x40(%%rsi),%%zmm7	\n\t"\
		"vmovaps	%%zmm0,%%zmm2		\n\t"\
		"vmovaps	%%zmm1,%%zmm3		\n\t"\
		"vmulpd	%%zmm6,%%zmm0,%%zmm0		/* c4 */\n\t"\
		"vmulpd	%%zmm6,%%zmm1,%%zmm1		\n\t"\
		"vmulpd	%%zmm7,%%zmm2,%%zmm2		\n\t"\
		"vmulpd	%%zmm7,%%zmm3,%%zmm3		\n\t"\
		"vmovaps	%%zmm4,%%zmm6		\n\t"\
		"vsubpd	%%zmm2,%%zmm1,%%zmm1		\n\t"\
		"vmovaps	%%zmm5,%%zmm7		\n\t"\
		"vmulpd	0x80(%%rsi),%%zmm4,%%zmm4	/* c5 */\n\t"\
		"vaddpd	%%zmm3,%%zmm0,%%zmm0		\n\t"\
		"vmulpd	0x80(%%rsi),%%zmm5,%%zmm5	\n\t"\
		"vmulpd	0xc0(%%rsi),%%zmm6,%%zmm6	\n\t"\
		"vmovaps	%%zmm0,%%zmm2		\n\t"\
		"vmulpd	0xc0(%%rsi),%%zmm7,%%zmm7	\n\t"\
		"vsubpd	%%zmm6,%%zmm5,%%zmm5		\n\t"\
		"vmovaps	%%zmm1,%%zmm3		\n\t"\
		"vaddpd	%%zmm7,%%zmm4,%%zmm4		\n\t"\
		"vaddpd	%%zmm4,%%zmm0,%%zmm0		\n\t"\
		"vaddpd	%%zmm5,%%zmm1,%%zmm1		\n\t"\
		"vsubpd	%%zmm4,%%zmm2,%%zmm2		\n\t"\
		"vsubpd	%%zmm5,%%zmm3,%%zmm3		\n\t"\
		"/* Do	the p2,3 combo: */	\n\t"\
		"addq	$0x100,%%rsi 	/* c6,7 */\n\t"\
		"vmovaps	    (%%rdx),%%zmm4	\n\t"\
		"vmovaps	0x40(%%rdx),%%zmm5	\n\t"\
		"vmovaps	    (%%rdx),%%zmm6	\n\t"\
		"vmovaps	0x40(%%rdx),%%zmm7	\n\t"\
		"vmulpd	0x80(%%rsi),%%zmm4,%%zmm4	/* c7 */\n\t"\
		"vmulpd	0x80(%%rsi),%%zmm5,%%zmm5	\n\t"\
		"vmulpd	0xc0(%%rsi),%%zmm6,%%zmm6	\n\t"\
		"vmulpd	0xc0(%%rsi),%%zmm7,%%zmm7	\n\t"\
		"vsubpd	%%zmm6,%%zmm5,%%zmm5		\n\t"\
		"vaddpd	%%zmm7,%%zmm4,%%zmm4		\n\t"\
		"vmovaps	%%zmm5,0x40(%%rax)	\n\t"\
		"vmovaps	%%zmm4,    (%%rax)	\n\t"\
		"vmovaps	    (%%rbx),%%zmm4	\n\t"\
		"vmovaps	0x40(%%rbx),%%zmm5	\n\t"\
		"vmovaps	    (%%rbx),%%zmm6	\n\t"\
		"vmovaps	0x40(%%rbx),%%zmm7	\n\t"\
		"vmulpd	    (%%rsi),%%zmm4,%%zmm4	/* c6 */\n\t"\
		"vmulpd	    (%%rsi),%%zmm5,%%zmm5	\n\t"\
		"vmulpd	0x40(%%rsi),%%zmm6,%%zmm6	\n\t"\
		"vmulpd	0x40(%%rsi),%%zmm7,%%zmm7	\n\t"\
		"vsubpd	%%zmm6,%%zmm5,%%zmm5		\n\t"\
		"vaddpd	%%zmm7,%%zmm4,%%zmm4		\n\t"\
		"vmovaps	%%zmm5,%%zmm7		\n\t"\
		"vmovaps	%%zmm4,%%zmm6		\n\t"\
		"vsubpd	    (%%rax),%%zmm4,%%zmm4	\n\t"\
		"vsubpd	0x40(%%rax),%%zmm5,%%zmm5	\n\t"\
		"vaddpd	    (%%rax),%%zmm6,%%zmm6	\n\t"\
		"vaddpd	0x40(%%rax),%%zmm7,%%zmm7	\n\t"\
		"/* Finish radix-4 bfly, store results: */\n\t"\
		"vsubpd	%%zmm6,%%zmm0,%%zmm0		\n\t"\
		"vsubpd	%%zmm5,%%zmm2,%%zmm2		\n\t"\
		"vsubpd	%%zmm7,%%zmm1,%%zmm1		\n\t"\
		"vsubpd	%%zmm4,%%zmm3,%%zmm3		\n\t"\
		"vmovaps	%%zmm0,    (%%rbx)	\n\t"\
		"vmovaps	%%zmm2,    (%%rdx)	\n\t"\
		"vmovaps	%%zmm1,0x40(%%rbx)	\n\t"\
		"vmovaps	%%zmm3,0x40(%%rcx)	\n\t"\
		"vaddpd	%%zmm6,%%zmm6,%%zmm6		\n\t"\
		"vaddpd	%%zmm5,%%zmm5,%%zmm5		\n\t"\
		"vaddpd	%%zmm7,%%zmm7,%%zmm7		\n\t"\
		"vaddpd	%%zmm4,%%zmm4,%%zmm4		\n\t"\
		"vaddpd	%%zmm0,%%zmm6,%%zmm6		\n\t"\
		"vaddpd	%%zmm2,%%zmm5,%%zmm5		\n\t"\
		"vaddpd	%%zmm1,%%zmm7,%%zmm7		\n\t"\
		"vaddpd	%%zmm3,%%zmm4,%%zmm4		\n\t"\
		"vmovaps	%%zmm6,    (%%rax)	\n\t"\
		"vmovaps	%%zmm5,    (%%rcx)	\n\t"\
		"vmovaps	%%zmm7,0x40(%%rax)	\n\t"\
		"vmovaps	%%zmm4,0x40(%%rdx)	\n\t"\
		"\n\t"\
	/*...Block 2: outputs into __in0 + [8,9,a,b]*istride: */\
		"addq	$%c[__i4],%%rax	\n\t"/* __in0 + 8*istride */\
		"addq	$%c[__i4],%%rcx	\n\t"/* __in0 + 9*istride */\
		"addq	$%c[__i4],%%rbx	\n\t"/* __in0 + a*istride */\
		"addq	$%c[__i4],%%rdx	\n\t"/* __in0 + b*istride */\
		"/* Do	the p0,1 combo: */	\n\t"\
		"addq	$0x100,%%rsi 	/* c8,9 */\n\t"\
		"vmovaps	    (%%rax),%%zmm0	\n\t"\
		"vmovaps	    (%%rcx),%%zmm4	\n\t"\
		"vmovaps	0x40(%%rax),%%zmm1	\n\t"\
		"vmovaps	0x40(%%rcx),%%zmm5	\n\t"\
		"vmovaps	    (%%rsi),%%zmm6	\n\t"\
		"vmovaps	0x40(%%rsi),%%zmm7	\n\t"\
		"vmovaps	%%zmm0,%%zmm2		\n\t"\
		"vmovaps	%%zmm1,%%zmm3		\n\t"\
		"vmulpd	%%zmm6,%%zmm0,%%zmm0		/* c8 */\n\t"\
		"vmulpd	%%zmm6,%%zmm1,%%zmm1		\n\t"\
		"vmulpd	%%zmm7,%%zmm2,%%zmm2		\n\t"\
		"vmulpd	%%zmm7,%%zmm3,%%zmm3		\n\t"\
		"vmovaps	%%zmm4,%%zmm6		\n\t"\
		"vsubpd	%%zmm2,%%zmm1,%%zmm1		\n\t"\
		"vmovaps	%%zmm5,%%zmm7		\n\t"\
		"vmulpd	0x80(%%rsi),%%zmm4,%%zmm4	/* c9 */\n\t"\
		"vaddpd	%%zmm3,%%zmm0,%%zmm0		\n\t"\
		"vmulpd	0x80(%%rsi),%%zmm5,%%zmm5	\n\t"\
		"vmulpd	0xc0(%%rsi),%%zmm6,%%zmm6	\n\t"\
		"vmovaps	%%zmm0,%%zmm2		\n\t"\
		"vmulpd	0xc0(%%rsi),%%zmm7,%%zmm7	\n\t"\
		"vsubpd	%%zmm6,%%zmm5,%%zmm5		\n\t"\
		"vmovaps	%%zmm1,%%zmm3		\n\t"\
		"vaddpd	%%zmm7,%%zmm4,%%zmm4		\n\t"\
		"vaddpd	%%zmm4,%%zmm0,%%zmm0		\n\t"\
		"vaddpd	%%zmm5,%%zmm1,%%zmm1		\n\t"\
		"vsubpd	%%zmm4,%%zmm2,%%zmm2		\n\t"\
		"vsubpd	%%zmm5,%%zmm3,%%zmm3		\n\t"\
		"/* Do	the p2,3 combo: */	\n\t"\
		"addq	$0x100,%%rsi 	/* ca,b */\n\t"\
		"vmovaps	    (%%rdx),%%zmm4	\n\t"\
		"vmovaps	0x40(%%rdx),%%zmm5	\n\t"\
		"vmovaps	    (%%rdx),%%zmm6	\n\t"\
		"vmovaps	0x40(%%rdx),%%zmm7	\n\t"\
		"vmulpd	0x80(%%rsi),%%zmm4,%%zmm4	/* cb */\n\t"\
		"vmulpd	0x80(%%rsi),%%zmm5,%%zmm5	\n\t"\
		"vmulpd	0xc0(%%rsi),%%zmm6,%%zmm6	\n\t"\
		"vmulpd	0xc0(%%rsi),%%zmm7,%%zmm7	\n\t"\
		"vsubpd	%%zmm6,%%zmm5,%%zmm5		\n\t"\
		"vaddpd	%%zmm7,%%zmm4,%%zmm4		\n\t"\
		"vmovaps	%%zmm5,0x40(%%rax)	\n\t"\
		"vmovaps	%%zmm4,    (%%rax)	\n\t"\
		"vmovaps	    (%%rbx),%%zmm4	\n\t"\
		"vmovaps	0x40(%%rbx),%%zmm5	\n\t"\
		"vmovaps	    (%%rbx),%%zmm6	\n\t"\
		"vmovaps	0x40(%%rbx),%%zmm7	\n\t"\
		"vmulpd	    (%%rsi),%%zmm4,%%zmm4	/* ca */\n\t"\
		"vmulpd	    (%%rsi),%%zmm5,%%zmm5	\n\t"\
		"vmulpd	0x40(%%rsi),%%zmm6,%%zmm6	\n\t"\
		"vmulpd	0x40(%%rsi),%%zmm7,%%zmm7	\n\t"\
		"vsubpd	%%zmm6,%%zmm5,%%zmm5		\n\t"\
		"vaddpd	%%zmm7,%%zmm4,%%zmm4		\n\t"\
		"vmovaps	%%zmm5,%%zmm7		\n\t"\
		"vmovaps	%%zmm4,%%zmm6		\n\t"\
		"vsubpd	    (%%rax),%%zmm4,%%zmm4	\n\t"\
		"vsubpd	0x40(%%rax),%%zmm5,%%zmm5	\n\t"\
		"vaddpd	    (%%rax),%%zmm6,%%zmm6	\n\t"\
		"vaddpd	0x40(%%rax),%%zmm7,%%zmm7	\n\t"\
		"/* Finish radix-4 bfly, store results: */\n\t"\
		"vsubpd	%%zmm6,%%zmm0,%%zmm0		\n\t"\
		"vsubpd	%%zmm5,%%zmm2,%%zmm2		\n\t"\
		"vsubpd	%%zmm7,%%zmm1,%%zmm1		\n\t"\
		"vsubpd	%%zmm4,%%zmm3,%%zmm3		\n\t"\
		"vmovaps	%%zmm0,    (%%rbx)	\n\t"\
		"vmovaps	%%zmm2,    (%%rdx)	\n\t"\
		"vmovaps	%%zmm1,0x40(%%rbx)	\n\t"\
		"vmovaps	%%zmm3,0x40(%%rcx)	\n\t"\
		"vaddpd	%%zmm6,%%zmm6,%%zmm6		\n\t"\
		"vaddpd	%%zmm5,%%zmm5,%%zmm5		\n\t"\
		"vaddpd	%%zmm7,%%zmm7,%%zmm7		\n\t"\
		"vaddpd	%%zmm4,%%zmm4,%%zmm4		\n\t"\
		"vaddpd	%%zmm0,%%zmm6,%%zmm6		\n\t"\
		"vaddpd	%%zmm2,%%zmm5,%%zmm5		\n\t"\
		"vaddpd	%%zmm1,%%zmm7,%%zmm7		\n\t"\
		"vaddpd	%%zmm3,%%zmm4,%%zmm4		\n\t"\
		"vmovaps	%%zmm6,    (%%rax)	\n\t"\
		"vmovaps	%%zmm5,    (%%rcx)	\n\t"\
		"vmovaps	%%zmm7,0x40(%%rax)	\n\t"\
		"vmovaps	%%zmm4,0x40(%%rdx)	\n\t"\
		"\n\t"\
	/*...Block 3: outputs into __in0 + [c,d,e,f]*istride: */\
		"addq	$%c[__i4],%%rax	\n\t"/* __in0 + c*istride */\
		"addq	$%c[__i4],%%rcx	\n\t"/* __in0 + d*istride */\
		"addq	$%c[__i4],%%rbx	\n\t"/* __in0 + e*istride */\
		"addq	$%c[__i4],%%rdx	\n\t"/* __in0 + f*istride */\
		"/* Do	the p0,1 combo: */	\n\t"\
		"addq	$0x100,%%rsi 	/* cc,d */\n\t"\
		"vmovaps	    (%%rax),%%zmm0	\n\t"\
		"vmovaps	    (%%rcx),%%zmm4	\n\t"\
		"vmovaps	0x40(%%rax),%%zmm1	\n\t"\
		"vmovaps	0x40(%%rcx),%%zmm5	\n\t"\
		"vmovaps	    (%%rsi),%%zmm6	\n\t"\
		"vmovaps	0x40(%%rsi),%%zmm7	\n\t"\
		"vmovaps	%%zmm0,%%zmm2		\n\t"\
		"vmovaps	%%zmm1,%%zmm3		\n\t"\
		"vmulpd	%%zmm6,%%zmm0,%%zmm0		/* cc */\n\t"\
		"vmulpd	%%zmm6,%%zmm1,%%zmm1		\n\t"\
		"vmulpd	%%zmm7,%%zmm2,%%zmm2		\n\t"\
		"vmulpd	%%zmm7,%%zmm3,%%zmm3		\n\t"\
		"vmovaps	%%zmm4,%%zmm6		\n\t"\
		"vsubpd	%%zmm2,%%zmm1,%%zmm1		\n\t"\
		"vmovaps	%%zmm5,%%zmm7		\n\t"\
		"vmulpd	0x80(%%rsi),%%zmm4,%%zmm4	/* cd */\n\t"\
		"vaddpd	%%zmm3,%%zmm0,%%zmm0		\n\t"\
		"vmulpd	0x80(%%rsi),%%zmm5,%%zmm5	\n\t"\
		"vmulpd	0xc0(%%rsi),%%zmm6,%%zmm6	\n\t"\
		"vmovaps	%%zmm0,%%zmm2		\n\t"\
		"vmulpd	0xc0(%%rsi),%%zmm7,%%zmm7	\n\t"\
		"vsubpd	%%zmm6,%%zmm5,%%zmm5		\n\t"\
		"vmovaps	%%zmm1,%%zmm3		\n\t"\
		"vaddpd	%%zmm7,%%zmm4,%%zmm4		\n\t"\
		"vaddpd	%%zmm4,%%zmm0,%%zmm0		\n\t"\
		"vaddpd	%%zmm5,%%zmm1,%%zmm1		\n\t"\
		"vsubpd	%%zmm4,%%zmm2,%%zmm2		\n\t"\
		"vsubpd	%%zmm5,%%zmm3,%%zmm3		\n\t"\
		"/* Do	the p2,3 combo: */	\n\t"\
		"addq	$0x100,%%rsi 	/* ce,f */\n\t"\
		"vmovaps	    (%%rdx),%%zmm4	\n\t"\
		"vmovaps	0x40(%%rdx),%%zmm5	\n\t"\
		"vmovaps	    (%%rdx),%%zmm6	\n\t"\
		"vmovaps	0x40(%%rdx),%%zmm7	\n\t"\
		"vmulpd	0x80(%%rsi),%%zmm4,%%zmm4	/* cf */\n\t"\
		"vmulpd	0x80(%%rsi),%%zmm5,%%zmm5	\n\t"\
		"vmulpd	0xc0(%%rsi),%%zmm6,%%zmm6	\n\t"\
		"vmulpd	0xc0(%%rsi),%%zmm7,%%zmm7	\n\t"\
		"vsubpd	%%zmm6,%%zmm5,%%zmm5		\n\t"\
		"vaddpd	%%zmm7,%%zmm4,%%zmm4		\n\t"\
		"vmovaps	%%zmm5,0x40(%%rax)	\n\t"\
		"vmovaps	%%zmm4,    (%%rax)	\n\t"\
		"vmovaps	    (%%rbx),%%zmm4	\n\t"\
		"vmovaps	0x40(%%rbx),%%zmm5	\n\t"\
		"vmovaps	    (%%rbx),%%zmm6	\n\t"\
		"vmovaps	0x40(%%rbx),%%zmm7	\n\t"\
		"vmulpd	    (%%rsi),%%zmm4,%%zmm4	/* ce */\n\t"\
		"vmulpd	    (%%rsi),%%zmm5,%%zmm5	\n\t"\
		"vmulpd	0x40(%%rsi),%%zmm6,%%zmm6	\n\t"\
		"vmulpd	0x40(%%rsi),%%zmm7,%%zmm7	\n\t"\
		"vsubpd	%%zmm6,%%zmm5,%%zmm5		\n\t"\
		"vaddpd	%%zmm7,%%zmm4,%%zmm4		\n\t"\
		"vmovaps	%%zmm5,%%zmm7		\n\t"\
		"vmovaps	%%zmm4,%%zmm6		\n\t"\
		"vsubpd	    (%%rax),%%zmm4,%%zmm4	\n\t"\
		"vsubpd	0x40(%%rax),%%zmm5,%%zmm5	\n\t"\
		"vaddpd	    (%%rax),%%zmm6,%%zmm6	\n\t"\
		"vaddpd	0x40(%%rax),%%zmm7,%%zmm7	\n\t"\
		"/* Finish radix-4 bfly, store results: */\n\t"\
		"vsubpd	%%zmm6,%%zmm0,%%zmm0		\n\t"\
		"vsubpd	%%zmm5,%%zmm2,%%zmm2		\n\t"\
		"vsubpd	%%zmm7,%%zmm1,%%zmm1		\n\t"\
		"vsubpd	%%zmm4,%%zmm3,%%zmm3		\n\t"\
		"vmovaps	%%zmm0,    (%%rbx)	\n\t"\
		"vmovaps	%%zmm2,    (%%rdx)	\n\t"\
		"vmovaps	%%zmm1,0x40(%%rbx)	\n\t"\
		"vmovaps	%%zmm3,0x40(%%rcx)	\n\t"\
		"vaddpd	%%zmm6,%%zmm6,%%zmm6		\n\t"\
		"vaddpd	%%zmm5,%%zmm5,%%zmm5		\n\t"\
		"vaddpd	%%zmm7,%%zmm7,%%zmm7		\n\t"\
		"vaddpd	%%zmm4,%%zmm4,%%zmm4		\n\t"\
		"vaddpd	%%zmm0,%%zmm6,%%zmm6		\n\t"\
		"vaddpd	%%zmm2,%%zmm5,%%zmm5		\n\t"\
		"vaddpd	%%zmm1,%%zmm7,%%zmm7		\n\t"\
		"vaddpd	%%zmm3,%%zmm4,%%zmm4		\n\t"\
		"vmovaps	%%zmm6,    (%%rax)	\n\t"\
		"vmovaps	%%zmm5,    (%%rcx)	\n\t"\
		"vmovaps	%%zmm7,0x40(%%rax)	\n\t"\
		"vmovaps	%%zmm4,0x40(%%rdx)	\n\t"\
	/*************************************************************************************/\
	/*  And now do four more radix-4 transforms, including the internal twiddle factors: */\
	/*************************************************************************************/\
		"movq	%[__isrt2],%%rsi 	\n\t"\
	/* Block 0: Combine 0-output of each radix-4, i.e. inputs from __in0 + [0,4,8,c]*istride: */\
		"movq	%[__in0],%%rax		\n\t"\
		"leaq	%c[__i4](%%rax),%%rbx	\n\t"/* __in0 +   [4*istride] */\
		"leaq	%c[__i4](%%rbx),%%rcx	\n\t"/* __in0 + 2*[4*istride] */\
		"leaq	%c[__i4](%%rcx),%%rdx	\n\t"/* __in0 + 3*[4*istride] */\
		"vmovaps	    (%%rax),%%zmm0	\n\t"\
		"vmovaps	    (%%rcx),%%zmm4	\n\t"\
		"vmovaps	0x40(%%rax),%%zmm1	\n\t"\
		"vmovaps	0x40(%%rcx),%%zmm5	\n\t"\
		"vmovaps	    (%%rbx),%%zmm2	\n\t"\
		"vmovaps	    (%%rdx),%%zmm6	\n\t"\
		"vmovaps	0x40(%%rbx),%%zmm3	\n\t"\
		"vmovaps	0x40(%%rdx),%%zmm7	\n\t"\
		"vsubpd	%%zmm2,%%zmm0,%%zmm0		\n\t"\
		"vsubpd	%%zmm6,%%zmm4,%%zmm4		\n\t"\
		"vsubpd	%%zmm3,%%zmm1,%%zmm1		\n\t"\
		"vsubpd	%%zmm7,%%zmm5,%%zmm5		\n\t"\
		"vaddpd	%%zmm2,%%zmm2,%%zmm2		\n\t"\
		"vaddpd	%%zmm6,%%zmm6,%%zmm6		\n\t"\
		"vaddpd	%%zmm3,%%zmm3,%%zmm3		\n\t"\
		"vaddpd	%%zmm7,%%zmm7,%%zmm7		\n\t"\
		"vaddpd	%%zmm0,%%zmm2,%%zmm2		\n\t"\
		"vaddpd	%%zmm4,%%zmm6,%%zmm6		\n\t"\
		"vaddpd	%%zmm1,%%zmm3,%%zmm3		\n\t"\
		"vaddpd	%%zmm5,%%zmm7,%%zmm7		\n\t"\
		"vsubpd	%%zmm6,%%zmm2,%%zmm2		\n\t"\
		"vsubpd	%%zmm7,%%zmm3,%%zmm3		\n\t"\
		"vaddpd	%%zmm6,%%zmm6,%%zmm6		\n\t"\
		"vaddpd	%%zmm7,%%zmm7,%%zmm7		\n\t"\
		"movq	%[__out0],%%r10		\n\t"\
		"leaq	%c[__o4](%%r10),%%r11	\n\t"/* __out0 + 4*ostride */\
		"leaq	%c[__o4](%%r11),%%r12	\n\t"/* __out0 + 8*ostride */\
		"leaq	%c[__o4](%%r12),%%r13	\n\t"/* __out0 + c*ostride */\
		"vmovaps	%%zmm2,    (%%r12)	\n\t"\
		"vmovaps	%%zmm3,0x40(%%r12)	\n\t"\
		"vaddpd	%%zmm2,%%zmm6,%%zmm6	\n\t"\
		"vaddpd	%%zmm3,%%zmm7,%%zmm7	\n\t"\
		"vmovaps	%%zmm6,    (%%r10)	\n\t"\
		"vmovaps	%%zmm7,0x40(%%r10)	\n\t"\
		"vsubpd	%%zmm5,%%zmm0,%%zmm0	\n\t"\
		"vsubpd	%%zmm4,%%zmm1,%%zmm1	\n\t"\
		"vaddpd	%%zmm5,%%zmm5,%%zmm5	\n\t"\
		"vaddpd	%%zmm4,%%zmm4,%%zmm4	\n\t"\
		"vmovaps	%%zmm0,    (%%r13)	\n\t"/* These 2 outputs [4/c] swapped w.r.to dif [2/3] due to +-I sign diff */\
		"vmovaps	%%zmm1,0x40(%%r11)	\n\t"\
		"vaddpd	%%zmm0,%%zmm5,%%zmm5	\n\t"\
		"vaddpd	%%zmm1,%%zmm4,%%zmm4	\n\t"\
		"vmovaps	%%zmm5,    (%%r11)	\n\t"\
		"vmovaps	%%zmm4,0x40(%%r13)	\n\t"\
	/* Block 1: Combine 1-output of each radix-4, i.e. inputs from __in0 + [1,5,9,d]*istride: */\
		"addq	$%c[__i1],%%rax	\n\t"/* __in0 + 1*istride */\
		"addq	$%c[__i1],%%rcx	\n\t"/* __in0 + 5*istride */\
		"addq	$%c[__i1],%%rbx	\n\t"/* __in0 + 9*istride */\
		"addq	$%c[__i1],%%rdx	\n\t"/* __in0 + d*istride */\
		"vmovaps	    (%%rdx),%%zmm0	\n\t"\
		"vmovaps	0x40(%%rdx),%%zmm1	\n\t"\
		"vmovaps	    (%%rdx),%%zmm2	\n\t"\
		"vmovaps	0x40(%%rdx),%%zmm3	\n\t"\
		"vmulpd	0x80(%%rsi),%%zmm0,%%zmm0	\n\t"/* ss0 */\
		"vmulpd	0x80(%%rsi),%%zmm1,%%zmm1	\n\t"\
		"vmulpd	0x40(%%rsi),%%zmm2,%%zmm2	\n\t"/* cc0 */\
		"vmulpd	0x40(%%rsi),%%zmm3,%%zmm3	\n\t"\
		"vsubpd	%%zmm2,%%zmm1,%%zmm1		\n\t"\
		"vaddpd	%%zmm3,%%zmm0,%%zmm0		\n\t"\
		"vmovaps	    (%%rcx),%%zmm4	\n\t"\
		"vmovaps	0x40(%%rcx),%%zmm5	\n\t"\
		"vmovaps	    (%%rcx),%%zmm6	\n\t"\
		"vmovaps	0x40(%%rcx),%%zmm7	\n\t"\
		"vmulpd	0x40(%%rsi),%%zmm4,%%zmm4	\n\t"/* cc0 */\
		"vmulpd	0x40(%%rsi),%%zmm5,%%zmm5	\n\t"\
		"vmulpd	0x80(%%rsi),%%zmm6,%%zmm6	\n\t"/* ss0 */\
		"vmulpd	0x80(%%rsi),%%zmm7,%%zmm7	\n\t"\
		"vsubpd	%%zmm6,%%zmm5,%%zmm5		\n\t"\
		"vaddpd	%%zmm7,%%zmm4,%%zmm4		\n\t"\
		"vmovaps	%%zmm5,%%zmm7		\n\t"\
		"vmovaps	%%zmm4,%%zmm6		\n\t"\
		"vaddpd	%%zmm0,%%zmm4,%%zmm4		\n\t"\
		"vaddpd	%%zmm1,%%zmm5,%%zmm5		\n\t"\
		"vsubpd	%%zmm0,%%zmm6,%%zmm6		\n\t"\
		"vsubpd	%%zmm1,%%zmm7,%%zmm7		\n\t"\
		"vmovaps	    (%%rbx),%%zmm2	\n\t"\
		"vmovaps	0x40(%%rbx),%%zmm3	\n\t"\
		"vmovaps	    (%%rax),%%zmm0	\n\t"\
		"vmovaps	0x40(%%rax),%%zmm1	\n\t"\
		"vaddpd	0x40(%%rbx),%%zmm2,%%zmm2	\n\t"\
		"vsubpd	    (%%rbx),%%zmm3,%%zmm3	\n\t"\
		"vmulpd	    (%%rsi),%%zmm2,%%zmm2	\n\t"/* isrt2 */\
		"vmulpd	    (%%rsi),%%zmm3,%%zmm3	\n\t"\
		"vsubpd	%%zmm2,%%zmm0,%%zmm0		\n\t"\
		"vsubpd	%%zmm3,%%zmm1,%%zmm1		\n\t"\
		"vaddpd	%%zmm2,%%zmm2,%%zmm2		\n\t"\
		"vaddpd	%%zmm3,%%zmm3,%%zmm3		\n\t"\
		"vaddpd	%%zmm0,%%zmm2,%%zmm2		\n\t"\
		"vaddpd	%%zmm1,%%zmm3,%%zmm3		\n\t"\
		"addq	$%c[__o1],%%r10	\n\t"/* __out0 + 1*ostride */\
		"addq	$%c[__o1],%%r11	\n\t"/* __out0 + 5*ostride */\
		"addq	$%c[__o1],%%r12	\n\t"/* __out0 + 9*ostride */\
		"addq	$%c[__o1],%%r13	\n\t"/* __out0 + d*ostride */\
		"vsubpd	%%zmm4,%%zmm2,%%zmm2		\n\t"\
		"vsubpd	%%zmm5,%%zmm3,%%zmm3		\n\t"\
		"vmovaps	%%zmm2,    (%%r12)	\n\t"\
		"vmovaps	%%zmm3,0x40(%%r12)	\n\t"\
		"vaddpd	%%zmm4,%%zmm4,%%zmm4		\n\t"\
		"vaddpd	%%zmm5,%%zmm5,%%zmm5		\n\t"\
		"vaddpd	%%zmm2,%%zmm4,%%zmm4		\n\t"\
		"vaddpd	%%zmm3,%%zmm5,%%zmm5		\n\t"\
		"vmovaps	%%zmm4,    (%%r10)	\n\t"\
		"vmovaps	%%zmm5,0x40(%%r10)	\n\t"\
		"vsubpd	%%zmm7,%%zmm0,%%zmm0		\n\t"\
		"vsubpd	%%zmm6,%%zmm1,%%zmm1		\n\t"\
		"vmovaps	%%zmm0,    (%%r13)	\n\t"\
		"vmovaps	%%zmm1,0x40(%%r11)	\n\t"\
		"vaddpd	%%zmm7,%%zmm7,%%zmm7		\n\t"\
		"vaddpd	%%zmm6,%%zmm6,%%zmm6		\n\t"\
		"vaddpd	%%zmm0,%%zmm7,%%zmm7		\n\t"\
		"vaddpd	%%zmm1,%%zmm6,%%zmm6		\n\t"\
		"vmovaps	%%zmm7,    (%%r11)	\n\t"\
		"vmovaps	%%zmm6,0x40(%%r13)	\n\t"\
	/* Block 2: Combine 2-output of each radix-4, i.e. inputs from __in0 + [2,6,a,e]*istride: */\
		"vmovaps	(%%rsi),%%zmm2	/* isrt2 */\n\t"\
		"addq	$%c[__i1],%%rax	\n\t"/* __in0 + 2*istride */\
		"addq	$%c[__i1],%%rbx	\n\t"/* __in0 + 6*istride */\
		"addq	$%c[__i1],%%rcx	\n\t"/* __in0 + a*istride */\
		"addq	$%c[__i1],%%rdx	\n\t"/* __in0 + e*istride */\
		"vmovaps	    (%%rcx),%%zmm4	\n\t"\
		"vmovaps	0x40(%%rcx),%%zmm5	\n\t"\
		"vmovaps	    (%%rdx),%%zmm0	\n\t"\
		"vmovaps	0x40(%%rdx),%%zmm1	\n\t"\
		"vaddpd	0x40(%%rcx),%%zmm4,%%zmm4	\n\t"\
		"vsubpd	    (%%rcx),%%zmm5,%%zmm5	\n\t"\
		"vsubpd	0x40(%%rdx),%%zmm0,%%zmm0	\n\t"\
		"vaddpd	    (%%rdx),%%zmm1,%%zmm1	\n\t"\
		"vmulpd	%%zmm2,%%zmm4,%%zmm4		\n\t"\
		"vmulpd	%%zmm2,%%zmm5,%%zmm5		\n\t"\
		"vmulpd	%%zmm2,%%zmm0,%%zmm0		\n\t"\
		"vmulpd	%%zmm2,%%zmm1,%%zmm1		\n\t"\
		"vmovaps	%%zmm4,%%zmm6		\n\t"\
		"vmovaps	%%zmm5,%%zmm7		\n\t"\
		"vsubpd	%%zmm0,%%zmm4,%%zmm4		\n\t"\
		"vsubpd	%%zmm1,%%zmm5,%%zmm5		\n\t"\
		"vaddpd	%%zmm0,%%zmm6,%%zmm6		\n\t"\
		"vaddpd	%%zmm1,%%zmm7,%%zmm7		\n\t"\
		"vmovaps	    (%%rax),%%zmm0	\n\t"\
		"vmovaps	0x40(%%rax),%%zmm1	\n\t"\
		"vmovaps	    (%%rbx),%%zmm2	\n\t"\
		"vmovaps	0x40(%%rbx),%%zmm3	\n\t"\
		"vsubpd	0x40(%%rbx),%%zmm0,%%zmm0	\n\t"\
		"vsubpd	    (%%rbx),%%zmm1,%%zmm1	\n\t"\
		"vaddpd	    (%%rax),%%zmm3,%%zmm3	\n\t"\
		"vaddpd	0x40(%%rax),%%zmm2,%%zmm2	\n\t"\
		"addq	$%c[__o1],%%r10	\n\t"/* __out0 + 2*ostride */\
		"addq	$%c[__o1],%%r11	\n\t"/* __out0 + 6*ostride */\
		"addq	$%c[__o1],%%r12	\n\t"/* __out0 + a*ostride */\
		"addq	$%c[__o1],%%r13	\n\t"/* __out0 + e*ostride */\
		"vsubpd	%%zmm4,%%zmm3,%%zmm3		\n\t"\
		"vsubpd	%%zmm5,%%zmm1,%%zmm1		\n\t"\
		"vmovaps	%%zmm3,    (%%r12)	\n\t"\
		"vmovaps	%%zmm1,0x40(%%r12)	\n\t"\
		"vaddpd	%%zmm4,%%zmm4,%%zmm4		\n\t"\
		"vaddpd	%%zmm5,%%zmm5,%%zmm5		\n\t"\
		"vaddpd	%%zmm3,%%zmm4,%%zmm4		\n\t"\
		"vaddpd	%%zmm1,%%zmm5,%%zmm5		\n\t"\
		"vmovaps	%%zmm4,    (%%r10)	\n\t"\
		"vmovaps	%%zmm5,0x40(%%r10)	\n\t"\
		"vsubpd	%%zmm7,%%zmm0,%%zmm0		\n\t"\
		"vsubpd	%%zmm6,%%zmm2,%%zmm2		\n\t"\
		"vmovaps	%%zmm0,    (%%r13)	\n\t"\
		"vmovaps	%%zmm2,0x40(%%r11)	\n\t"\
		"vaddpd	%%zmm7,%%zmm7,%%zmm7		\n\t"\
		"vaddpd	%%zmm6,%%zmm6,%%zmm6		\n\t"\
		"vaddpd	%%zmm0,%%zmm7,%%zmm7		\n\t"\
		"vaddpd	%%zmm2,%%zmm6,%%zmm6		\n\t"\
		"vmovaps	%%zmm7,    (%%r11)	\n\t"\
		"vmovaps	%%zmm6,0x40(%%r13)	\n\t"\
	/* Block 3: Combine 3-output of each radix-4, i.e. inputs from __in0 + [3,7,b,f]*istride: */\
		"addq	$%c[__i1],%%rax	\n\t"/* __in0 + 3*istride */\
		"addq	$%c[__i1],%%rcx	\n\t"/* __in0 + 7*istride */\
		"addq	$%c[__i1],%%rbx	\n\t"/* __in0 + b*istride */\
		"addq	$%c[__i1],%%rdx	\n\t"/* __in0 + f*istride */\
		"vmovaps	    (%%rdx),%%zmm0	\n\t"\
		"vmovaps	0x40(%%rdx),%%zmm1	\n\t"\
		"vmovaps	    (%%rdx),%%zmm2	\n\t"\
		"vmovaps	0x40(%%rdx),%%zmm3	\n\t"\
		"vmulpd	0x40(%%rsi),%%zmm0,%%zmm0	\n\t"/* cc0 */\
		"vmulpd	0x40(%%rsi),%%zmm1,%%zmm1	\n\t"\
		"vmulpd	0x80(%%rsi),%%zmm2,%%zmm2	\n\t"/* ss0 */\
		"vmulpd	0x80(%%rsi),%%zmm3,%%zmm3	\n\t"\
		"vsubpd	%%zmm2,%%zmm1,%%zmm1		\n\t"\
		"vaddpd	%%zmm3,%%zmm0,%%zmm0		\n\t"\
		"vmovaps	    (%%rcx),%%zmm4	\n\t"\
		"vmovaps	0x40(%%rcx),%%zmm5	\n\t"\
		"vmovaps	    (%%rcx),%%zmm6	\n\t"\
		"vmovaps	0x40(%%rcx),%%zmm7	\n\t"\
		"vmulpd	0x80(%%rsi),%%zmm4,%%zmm4	\n\t"/* ss0 */\
		"vmulpd	0x80(%%rsi),%%zmm5,%%zmm5	\n\t"\
		"vmulpd	0x40(%%rsi),%%zmm6,%%zmm6	\n\t"/* cc0 */\
		"vmulpd	0x40(%%rsi),%%zmm7,%%zmm7	\n\t"\
		"vsubpd	%%zmm6,%%zmm5,%%zmm5		\n\t"\
		"vaddpd	%%zmm7,%%zmm4,%%zmm4		\n\t"\
		"vmovaps	%%zmm5,%%zmm7		\n\t"\
		"vmovaps	%%zmm4,%%zmm6		\n\t"\
		"vaddpd	%%zmm0,%%zmm4,%%zmm4		\n\t"\
		"vaddpd	%%zmm1,%%zmm5,%%zmm5		\n\t"\
		"vsubpd	%%zmm0,%%zmm6,%%zmm6		\n\t"\
		"vsubpd	%%zmm1,%%zmm7,%%zmm7		\n\t"\
		"vmovaps	    (%%rbx),%%zmm2	\n\t"\
		"vmovaps	0x40(%%rbx),%%zmm3	\n\t"\
		"vmovaps	    (%%rax),%%zmm0	\n\t"\
		"vmovaps	0x40(%%rax),%%zmm1	\n\t"\
		"vsubpd	0x40(%%rbx),%%zmm2,%%zmm2	\n\t"\
		"vaddpd	    (%%rbx),%%zmm3,%%zmm3	\n\t"\
		"vmulpd	    (%%rsi),%%zmm2,%%zmm2	\n\t"/* isrt2 */\
		"vmulpd	    (%%rsi),%%zmm3,%%zmm3	\n\t"\
		"vsubpd	%%zmm2,%%zmm0,%%zmm0		\n\t"\
		"vsubpd	%%zmm3,%%zmm1,%%zmm1		\n\t"\
		"vaddpd	%%zmm2,%%zmm2,%%zmm2		\n\t"\
		"vaddpd	%%zmm3,%%zmm3,%%zmm3		\n\t"\
		"vaddpd	%%zmm0,%%zmm2,%%zmm2		\n\t"\
		"vaddpd	%%zmm1,%%zmm3,%%zmm3		\n\t"\
		"addq	$%c[__o1],%%r10	\n\t"/* __out0 + 3*ostride */\
		"addq	$%c[__o1],%%r12	\n\t"/* __out0 + 7*ostride */\
		"addq	$%c[__o1],%%r11	\n\t"/* __out0 + b*ostride */\
		"addq	$%c[__o1],%%r13	\n\t"/* __out0 + f*ostride */\
		"vsubpd	%%zmm6,%%zmm0,%%zmm0		\n\t"\
		"vsubpd	%%zmm7,%%zmm1,%%zmm1		\n\t"\
		"vmovaps	%%zmm0,    (%%r12)	\n\t"\
		"vmovaps	%%zmm1,0x40(%%r12)	\n\t"\
		"vaddpd	%%zmm6,%%zmm6,%%zmm6		\n\t"\
		"vaddpd	%%zmm7,%%zmm7,%%zmm7		\n\t"\
		"vaddpd	%%zmm0,%%zmm6,%%zmm6		\n\t"\
		"vaddpd	%%zmm1,%%zmm7,%%zmm7		\n\t"\
		"vmovaps	%%zmm6,    (%%r10)	\n\t"\
		"vmovaps	%%zmm7,0x40(%%r10)	\n\t"\
		"vsubpd	%%zmm5,%%zmm2,%%zmm2		\n\t"\
		"vsubpd	%%zmm4,%%zmm3,%%zmm3		\n\t"\
		"vmovaps	%%zmm2,    (%%r13)	\n\t"\
		"vmovaps	%%zmm3,0x40(%%r11)	\n\t"\
		"vaddpd	%%zmm5,%%zmm5,%%zmm5		\n\t"\
		"vaddpd	%%zmm4,%%zmm4,%%zmm4		\n\t"\
		"vaddpd	%%zmm2,%%zmm5,%%zmm5		\n\t"\
		"vaddpd	%%zmm3,%%zmm4,%%zmm4		\n\t"\
		"vmovaps	%%zmm5,    (%%r11)	\n\t"\
		"vmovaps	%%zmm4,0x40(%%r13)	\n\t"\
		:					/* outputs: none */\
		: [__in0] "m" (Xin0)	/* All inputs from memory addresses here */\
		 ,[__i1] "e" (Xi1)\
		 ,[__i2] "e" (Xi2)\
		 ,[__i3] "e" (Xi3)\
		 ,[__i4] "e" (Xi4)\
		 ,[__out0] "m" (Xout0)\
		 ,[__o1] "e" (Xo1)\
		 ,[__o2] "e" (Xo2)\
		 ,[__o3] "e" (Xo3)\
		 ,[__o4] "e" (Xo4)\
		 ,[__isrt2] "m" (Xisrt2)\
		 ,[__c1] "m" (Xc1)\
		: "cc","memory","rax","rbx","rcx","rdx","rsi","r10","r11","r12","r13","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7"		/* Clobbered registers */\
	);\
	}

	// DIF version of above shares same sincos layout & data:
	/* SSE2_RADIX16_DIF_TWIDDLE_OOP: AVX-512 radix-16 DIF FFT pass with twiddles, out-of-place.
	Operands:
	  Xin0   = input base address ("m" operand);
	  Xi1,Xi4 = byte offsets corresponding to 1* and 4* the complex input stride
	            (compile-time constants, used as "e" operands via %c[...]);
	  Xout0  = output base address; Xoff = pointer to 16 int32 output offsets, each in
	           units of doubles (movslq-loaded, scaled by 8 in the leaq address calc);
	  Xisrt2 = address of isrt2, with cc0 at +0x40 and ss0 at +0x80;
	  Xc1    = base pointer of the c1-15 twiddle-root set (same layout as DIT version above).
	Pass 1 does four radix-4 DIF subtransforms in-place on __in0 + [0-3],[4-7],[8-b],[c-f]*istride;
	pass 2 combines their outputs with the internal twiddles and writes to __out0 + offsets.
	NOTE(review): the clobber list names xmm0-7; GCC treats these as aliasing the full
	zmm0-7 registers used in the template. */
	#define SSE2_RADIX16_DIF_TWIDDLE_OOP(Xin0,Xi1,Xi4, Xout0,Xoff, Xisrt2,Xc1)\
	{\
	__asm__ volatile (\
	/*...Block 0: Do in-place, i.e. outputs into __in0 + [0,1,2,3]*istride: */\
		"movq	%[__in0],%%rax		\n\t"\
		"leaq	%c[__i1](%%rax),%%rcx	\n\t"/* __in0 +   istride */\
		"leaq	%c[__i1](%%rcx),%%rbx	\n\t"/* __in0 + 2*istride */\
		"leaq	%c[__i1](%%rbx),%%rdx	\n\t"/* __in0 + 3*istride */\
		"/* Do	the p0,1 combo: */	\n\t"\
		"movq	%[__c1],%%rsi 	/* Roots sets c1-15 same as for DIT, w/c1 as base-ptr */\n\t"\
		"vmovaps	    (%%rax),%%zmm0	\n\t"\
		"vmovaps	    (%%rcx),%%zmm4	\n\t"\
		"vmovaps	0x40(%%rax),%%zmm1	\n\t"\
		"vmovaps	0x40(%%rcx),%%zmm5	\n\t"\
		"vmovaps	%%zmm4,%%zmm6		\n\t"\
		"vmovaps	%%zmm5,%%zmm7		\n\t"\
		"vmulpd	    (%%rsi),%%zmm4,%%zmm4	\n\t"\
		"vmulpd	    (%%rsi),%%zmm5,%%zmm5	\n\t"\
		"vmulpd	0x40(%%rsi),%%zmm6,%%zmm6	\n\t"\
		"vmovaps	%%zmm0,%%zmm2		\n\t"\
		"vmulpd	0x40(%%rsi),%%zmm7,%%zmm7	\n\t"\
		"vaddpd	%%zmm6,%%zmm5,%%zmm5		\n\t"\
		"vmovaps	%%zmm1,%%zmm3		\n\t"\
		"vsubpd	%%zmm7,%%zmm4,%%zmm4		\n\t"\
		"vaddpd	%%zmm4,%%zmm0,%%zmm0		\n\t"\
		"vaddpd	%%zmm5,%%zmm1,%%zmm1		\n\t"\
		"vsubpd	%%zmm4,%%zmm2,%%zmm2		\n\t"\
		"vsubpd	%%zmm5,%%zmm3,%%zmm3		\n\t"\
		"/* Do	the p2,3 combo: */	\n\t"\
		"addq	$0x80,%%rsi 	/* c2,3 */\n\t"\
		"vmovaps	    (%%rdx),%%zmm4	\n\t"\
		"vmovaps	0x40(%%rdx),%%zmm5	\n\t"\
		"vmovaps	    (%%rdx),%%zmm6	\n\t"\
		"vmovaps	0x40(%%rdx),%%zmm7	\n\t"\
		"vmulpd	0x80(%%rsi),%%zmm4,%%zmm4	/* c3 */\n\t"\
		"vmulpd	0x80(%%rsi),%%zmm5,%%zmm5	\n\t"\
		"vmulpd	0xc0(%%rsi),%%zmm6,%%zmm6	\n\t"\
		"vmulpd	0xc0(%%rsi),%%zmm7,%%zmm7	\n\t"\
		"vaddpd	%%zmm6,%%zmm5,%%zmm5		\n\t"\
		"vsubpd	%%zmm7,%%zmm4,%%zmm4		\n\t"\
		"vmovaps	%%zmm5,0x40(%%rax)	\n\t"/* tmp-store twiddled p3 in the p0 slot */\
		"vmovaps	%%zmm4,    (%%rax)	\n\t"\
		"vmovaps	    (%%rbx),%%zmm4	\n\t"\
		"vmovaps	0x40(%%rbx),%%zmm5	\n\t"\
		"vmovaps	    (%%rbx),%%zmm6	\n\t"\
		"vmovaps	0x40(%%rbx),%%zmm7	\n\t"\
		"vmulpd	    (%%rsi),%%zmm4,%%zmm4	/* c2 */\n\t"\
		"vmulpd	    (%%rsi),%%zmm5,%%zmm5	\n\t"\
		"vmulpd	0x40(%%rsi),%%zmm6,%%zmm6	\n\t"\
		"vmulpd	0x40(%%rsi),%%zmm7,%%zmm7	\n\t"\
		"vaddpd	%%zmm6,%%zmm5,%%zmm5		\n\t"\
		"vsubpd	%%zmm7,%%zmm4,%%zmm4		\n\t"\
		"vmovaps	%%zmm5,%%zmm7		\n\t"\
		"vmovaps	%%zmm4,%%zmm6		\n\t"\
		"vsubpd	    (%%rax),%%zmm4,%%zmm4	\n\t"\
		"vsubpd	0x40(%%rax),%%zmm5,%%zmm5	\n\t"\
		"vaddpd	    (%%rax),%%zmm6,%%zmm6	\n\t"\
		"vaddpd	0x40(%%rax),%%zmm7,%%zmm7	\n\t"\
		"/* Finish radix-4 bfly, store results: */\n\t"\
		"vsubpd	%%zmm6,%%zmm0,%%zmm0		\n\t"\
		"vsubpd	%%zmm5,%%zmm2,%%zmm2		\n\t"\
		"vsubpd	%%zmm7,%%zmm1,%%zmm1		\n\t"\
		"vsubpd	%%zmm4,%%zmm3,%%zmm3		\n\t"\
		"vmovaps	%%zmm0,    (%%rcx)	\n\t"\
		"vmovaps	%%zmm2,    (%%rbx)	\n\t"\
		"vmovaps	%%zmm1,0x40(%%rcx)	\n\t"\
		"vmovaps	%%zmm3,0x40(%%rdx)	\n\t"\
		"vaddpd	%%zmm6,%%zmm6,%%zmm6		\n\t"\
		"vaddpd	%%zmm5,%%zmm5,%%zmm5		\n\t"\
		"vaddpd	%%zmm7,%%zmm7,%%zmm7		\n\t"\
		"vaddpd	%%zmm4,%%zmm4,%%zmm4		\n\t"\
		"vaddpd	%%zmm0,%%zmm6,%%zmm6		\n\t"\
		"vaddpd	%%zmm2,%%zmm5,%%zmm5		\n\t"\
		"vaddpd	%%zmm1,%%zmm7,%%zmm7		\n\t"\
		"vaddpd	%%zmm3,%%zmm4,%%zmm4		\n\t"\
		"vmovaps	%%zmm6,    (%%rax)	\n\t"\
		"vmovaps	%%zmm5,    (%%rdx)	\n\t"\
		"vmovaps	%%zmm7,0x40(%%rax)	\n\t"\
		"vmovaps	%%zmm4,0x40(%%rbx)	\n\t"\
		"\n\t"\
	/*...Block 1: outputs into __in0 + [4,5,6,7]*istride: */\
		"addq	$%c[__i4],%%rax	\n\t"/* __in0 + 4*istride */\
		"addq	$%c[__i4],%%rcx	\n\t"/* __in0 + 5*istride */\
		"addq	$%c[__i4],%%rbx	\n\t"/* __in0 + 6*istride */\
		"addq	$%c[__i4],%%rdx	\n\t"/* __in0 + 7*istride */\
		"/* Do	the p0,1 combo: */	\n\t"\
		"addq	$0x100,%%rsi 	/* c4,5 */\n\t"\
		"vmovaps	    (%%rax),%%zmm0	\n\t"\
		"vmovaps	    (%%rcx),%%zmm4	\n\t"\
		"vmovaps	0x40(%%rax),%%zmm1	\n\t"\
		"vmovaps	0x40(%%rcx),%%zmm5	\n\t"\
		"vmovaps	    (%%rsi),%%zmm6	\n\t"\
		"vmovaps	0x40(%%rsi),%%zmm7	\n\t"\
		"vmovaps	%%zmm0,%%zmm2		\n\t"\
		"vmovaps	%%zmm1,%%zmm3		\n\t"\
		"vmulpd	%%zmm6,%%zmm0,%%zmm0		/* c4 */\n\t"\
		"vmulpd	%%zmm6,%%zmm1,%%zmm1		\n\t"\
		"vmulpd	%%zmm7,%%zmm2,%%zmm2		\n\t"\
		"vmulpd	%%zmm7,%%zmm3,%%zmm3		\n\t"\
		"vmovaps	%%zmm4,%%zmm6		\n\t"\
		"vaddpd	%%zmm2,%%zmm1,%%zmm1		\n\t"\
		"vmovaps	%%zmm5,%%zmm7		\n\t"\
		"vmulpd	0x80(%%rsi),%%zmm4,%%zmm4	/* c5 */\n\t"\
		"vsubpd	%%zmm3,%%zmm0,%%zmm0		\n\t"\
		"vmulpd	0x80(%%rsi),%%zmm5,%%zmm5	\n\t"\
		"vmulpd	0xc0(%%rsi),%%zmm6,%%zmm6	\n\t"\
		"vmovaps	%%zmm0,%%zmm2		\n\t"\
		"vmulpd	0xc0(%%rsi),%%zmm7,%%zmm7	\n\t"\
		"vaddpd	%%zmm6,%%zmm5,%%zmm5		\n\t"\
		"vmovaps	%%zmm1,%%zmm3		\n\t"\
		"vsubpd	%%zmm7,%%zmm4,%%zmm4		\n\t"\
		"vaddpd	%%zmm4,%%zmm0,%%zmm0		\n\t"\
		"vaddpd	%%zmm5,%%zmm1,%%zmm1		\n\t"\
		"vsubpd	%%zmm4,%%zmm2,%%zmm2		\n\t"\
		"vsubpd	%%zmm5,%%zmm3,%%zmm3		\n\t"\
		"/* Do	the p2,3 combo: */	\n\t"\
		"addq	$0x100,%%rsi 	/* c6,7 */\n\t"\
		"vmovaps	    (%%rdx),%%zmm4	\n\t"\
		"vmovaps	0x40(%%rdx),%%zmm5	\n\t"\
		"vmovaps	    (%%rdx),%%zmm6	\n\t"\
		"vmovaps	0x40(%%rdx),%%zmm7	\n\t"\
		"vmulpd	0x80(%%rsi),%%zmm4,%%zmm4	/* c7 */\n\t"\
		"vmulpd	0x80(%%rsi),%%zmm5,%%zmm5	\n\t"\
		"vmulpd	0xc0(%%rsi),%%zmm6,%%zmm6	\n\t"\
		"vmulpd	0xc0(%%rsi),%%zmm7,%%zmm7	\n\t"\
		"vaddpd	%%zmm6,%%zmm5,%%zmm5		\n\t"\
		"vsubpd	%%zmm7,%%zmm4,%%zmm4		\n\t"\
		"vmovaps	%%zmm5,0x40(%%rax)	\n\t"\
		"vmovaps	%%zmm4,    (%%rax)	\n\t"\
		"vmovaps	    (%%rbx),%%zmm4	\n\t"\
		"vmovaps	0x40(%%rbx),%%zmm5	\n\t"\
		"vmovaps	    (%%rbx),%%zmm6	\n\t"\
		"vmovaps	0x40(%%rbx),%%zmm7	\n\t"\
		"vmulpd	    (%%rsi),%%zmm4,%%zmm4	/* c6 */\n\t"\
		"vmulpd	    (%%rsi),%%zmm5,%%zmm5	\n\t"\
		"vmulpd	0x40(%%rsi),%%zmm6,%%zmm6	\n\t"\
		"vmulpd	0x40(%%rsi),%%zmm7,%%zmm7	\n\t"\
		"vaddpd	%%zmm6,%%zmm5,%%zmm5		\n\t"\
		"vsubpd	%%zmm7,%%zmm4,%%zmm4		\n\t"\
		"vmovaps	%%zmm5,%%zmm7		\n\t"\
		"vmovaps	%%zmm4,%%zmm6		\n\t"\
		"vsubpd	    (%%rax),%%zmm4,%%zmm4	\n\t"\
		"vsubpd	0x40(%%rax),%%zmm5,%%zmm5	\n\t"\
		"vaddpd	    (%%rax),%%zmm6,%%zmm6	\n\t"\
		"vaddpd	0x40(%%rax),%%zmm7,%%zmm7	\n\t"\
		"/* Finish radix-4 bfly, store results: */\n\t"\
		"vsubpd	%%zmm6,%%zmm0,%%zmm0		\n\t"\
		"vsubpd	%%zmm5,%%zmm2,%%zmm2		\n\t"\
		"vsubpd	%%zmm7,%%zmm1,%%zmm1		\n\t"\
		"vsubpd	%%zmm4,%%zmm3,%%zmm3		\n\t"\
		"vmovaps	%%zmm0,    (%%rcx)	\n\t"\
		"vmovaps	%%zmm2,    (%%rbx)	\n\t"\
		"vmovaps	%%zmm1,0x40(%%rcx)	\n\t"\
		"vmovaps	%%zmm3,0x40(%%rdx)	\n\t"\
		"vaddpd	%%zmm6,%%zmm6,%%zmm6		\n\t"\
		"vaddpd	%%zmm5,%%zmm5,%%zmm5		\n\t"\
		"vaddpd	%%zmm7,%%zmm7,%%zmm7		\n\t"\
		"vaddpd	%%zmm4,%%zmm4,%%zmm4		\n\t"\
		"vaddpd	%%zmm0,%%zmm6,%%zmm6		\n\t"\
		"vaddpd	%%zmm2,%%zmm5,%%zmm5		\n\t"\
		"vaddpd	%%zmm1,%%zmm7,%%zmm7		\n\t"\
		"vaddpd	%%zmm3,%%zmm4,%%zmm4		\n\t"\
		"vmovaps	%%zmm6,    (%%rax)	\n\t"\
		"vmovaps	%%zmm5,    (%%rdx)	\n\t"\
		"vmovaps	%%zmm7,0x40(%%rax)	\n\t"\
		"vmovaps	%%zmm4,0x40(%%rbx)	\n\t"\
		"\n\t"\
	/*...Block 2: outputs into __in0 + [8,9,a,b]*istride: */\
		"addq	$%c[__i4],%%rax	\n\t"/* __in0 + 8*istride */\
		"addq	$%c[__i4],%%rcx	\n\t"/* __in0 + 9*istride */\
		"addq	$%c[__i4],%%rbx	\n\t"/* __in0 + a*istride */\
		"addq	$%c[__i4],%%rdx	\n\t"/* __in0 + b*istride */\
		"/* Do	the p0,1 combo: */	\n\t"\
		"addq	$0x100,%%rsi 	/* c8,9 */\n\t"\
		"vmovaps	    (%%rax),%%zmm0	\n\t"\
		"vmovaps	    (%%rcx),%%zmm4	\n\t"\
		"vmovaps	0x40(%%rax),%%zmm1	\n\t"\
		"vmovaps	0x40(%%rcx),%%zmm5	\n\t"\
		"vmovaps	    (%%rsi),%%zmm6	\n\t"\
		"vmovaps	0x40(%%rsi),%%zmm7	\n\t"\
		"vmovaps	%%zmm0,%%zmm2		\n\t"\
		"vmovaps	%%zmm1,%%zmm3		\n\t"\
		"vmulpd	%%zmm6,%%zmm0,%%zmm0		/* c8 */\n\t"\
		"vmulpd	%%zmm6,%%zmm1,%%zmm1		\n\t"\
		"vmulpd	%%zmm7,%%zmm2,%%zmm2		\n\t"\
		"vmulpd	%%zmm7,%%zmm3,%%zmm3		\n\t"\
		"vmovaps	%%zmm4,%%zmm6		\n\t"\
		"vaddpd	%%zmm2,%%zmm1,%%zmm1		\n\t"\
		"vmovaps	%%zmm5,%%zmm7		\n\t"\
		"vmulpd	0x80(%%rsi),%%zmm4,%%zmm4	/* c9 */\n\t"\
		"vsubpd	%%zmm3,%%zmm0,%%zmm0		\n\t"\
		"vmulpd	0x80(%%rsi),%%zmm5,%%zmm5	\n\t"\
		"vmulpd	0xc0(%%rsi),%%zmm6,%%zmm6	\n\t"\
		"vmovaps	%%zmm0,%%zmm2		\n\t"\
		"vmulpd	0xc0(%%rsi),%%zmm7,%%zmm7	\n\t"\
		"vaddpd	%%zmm6,%%zmm5,%%zmm5		\n\t"\
		"vmovaps	%%zmm1,%%zmm3		\n\t"\
		"vsubpd	%%zmm7,%%zmm4,%%zmm4		\n\t"\
		"vaddpd	%%zmm4,%%zmm0,%%zmm0		\n\t"\
		"vaddpd	%%zmm5,%%zmm1,%%zmm1		\n\t"\
		"vsubpd	%%zmm4,%%zmm2,%%zmm2		\n\t"\
		"vsubpd	%%zmm5,%%zmm3,%%zmm3		\n\t"\
		"/* Do	the p2,3 combo: */	\n\t"\
		"addq	$0x100,%%rsi 	/* ca,b */\n\t"\
		"vmovaps	    (%%rdx),%%zmm4	\n\t"\
		"vmovaps	0x40(%%rdx),%%zmm5	\n\t"\
		"vmovaps	    (%%rdx),%%zmm6	\n\t"\
		"vmovaps	0x40(%%rdx),%%zmm7	\n\t"\
		"vmulpd	0x80(%%rsi),%%zmm4,%%zmm4	/* cb */\n\t"\
		"vmulpd	0x80(%%rsi),%%zmm5,%%zmm5	\n\t"\
		"vmulpd	0xc0(%%rsi),%%zmm6,%%zmm6	\n\t"\
		"vmulpd	0xc0(%%rsi),%%zmm7,%%zmm7	\n\t"\
		"vaddpd	%%zmm6,%%zmm5,%%zmm5		\n\t"\
		"vsubpd	%%zmm7,%%zmm4,%%zmm4		\n\t"\
		"vmovaps	%%zmm5,0x40(%%rax)	\n\t"\
		"vmovaps	%%zmm4,    (%%rax)	\n\t"\
		"vmovaps	    (%%rbx),%%zmm4	\n\t"\
		"vmovaps	0x40(%%rbx),%%zmm5	\n\t"\
		"vmovaps	    (%%rbx),%%zmm6	\n\t"\
		"vmovaps	0x40(%%rbx),%%zmm7	\n\t"\
		"vmulpd	    (%%rsi),%%zmm4,%%zmm4	/* ca */\n\t"\
		"vmulpd	    (%%rsi),%%zmm5,%%zmm5	\n\t"\
		"vmulpd	0x40(%%rsi),%%zmm6,%%zmm6	\n\t"\
		"vmulpd	0x40(%%rsi),%%zmm7,%%zmm7	\n\t"\
		"vaddpd	%%zmm6,%%zmm5,%%zmm5		\n\t"\
		"vsubpd	%%zmm7,%%zmm4,%%zmm4		\n\t"\
		"vmovaps	%%zmm5,%%zmm7		\n\t"\
		"vmovaps	%%zmm4,%%zmm6		\n\t"\
		"vsubpd	    (%%rax),%%zmm4,%%zmm4	\n\t"\
		"vsubpd	0x40(%%rax),%%zmm5,%%zmm5	\n\t"\
		"vaddpd	    (%%rax),%%zmm6,%%zmm6	\n\t"\
		"vaddpd	0x40(%%rax),%%zmm7,%%zmm7	\n\t"\
		"/* Finish radix-4 bfly, store results: */\n\t"\
		"vsubpd	%%zmm6,%%zmm0,%%zmm0		\n\t"\
		"vsubpd	%%zmm5,%%zmm2,%%zmm2		\n\t"\
		"vsubpd	%%zmm7,%%zmm1,%%zmm1		\n\t"\
		"vsubpd	%%zmm4,%%zmm3,%%zmm3		\n\t"\
		"vmovaps	%%zmm0,    (%%rcx)	\n\t"\
		"vmovaps	%%zmm2,    (%%rbx)	\n\t"\
		"vmovaps	%%zmm1,0x40(%%rcx)	\n\t"\
		"vmovaps	%%zmm3,0x40(%%rdx)	\n\t"\
		"vaddpd	%%zmm6,%%zmm6,%%zmm6		\n\t"\
		"vaddpd	%%zmm5,%%zmm5,%%zmm5		\n\t"\
		"vaddpd	%%zmm7,%%zmm7,%%zmm7		\n\t"\
		"vaddpd	%%zmm4,%%zmm4,%%zmm4		\n\t"\
		"vaddpd	%%zmm0,%%zmm6,%%zmm6		\n\t"\
		"vaddpd	%%zmm2,%%zmm5,%%zmm5		\n\t"\
		"vaddpd	%%zmm1,%%zmm7,%%zmm7		\n\t"\
		"vaddpd	%%zmm3,%%zmm4,%%zmm4		\n\t"\
		"vmovaps	%%zmm6,    (%%rax)	\n\t"\
		"vmovaps	%%zmm5,    (%%rdx)	\n\t"\
		"vmovaps	%%zmm7,0x40(%%rax)	\n\t"\
		"vmovaps	%%zmm4,0x40(%%rbx)	\n\t"\
		"\n\t"\
	/*...Block 3: outputs into __in0 + [c,d,e,f]*istride: */\
		"addq	$%c[__i4],%%rax	\n\t"/* __in0 + c*istride */\
		"addq	$%c[__i4],%%rcx	\n\t"/* __in0 + d*istride */\
		"addq	$%c[__i4],%%rbx	\n\t"/* __in0 + e*istride */\
		"addq	$%c[__i4],%%rdx	\n\t"/* __in0 + f*istride */\
		"/* Do	the p0,1 combo: */	\n\t"\
		"addq	$0x100,%%rsi 	/* cc,d */\n\t"\
		"vmovaps	    (%%rax),%%zmm0	\n\t"\
		"vmovaps	    (%%rcx),%%zmm4	\n\t"\
		"vmovaps	0x40(%%rax),%%zmm1	\n\t"\
		"vmovaps	0x40(%%rcx),%%zmm5	\n\t"\
		"vmovaps	    (%%rsi),%%zmm6	\n\t"\
		"vmovaps	0x40(%%rsi),%%zmm7	\n\t"\
		"vmovaps	%%zmm0,%%zmm2		\n\t"\
		"vmovaps	%%zmm1,%%zmm3		\n\t"\
		"vmulpd	%%zmm6,%%zmm0,%%zmm0		/* cc */\n\t"\
		"vmulpd	%%zmm6,%%zmm1,%%zmm1		\n\t"\
		"vmulpd	%%zmm7,%%zmm2,%%zmm2		\n\t"\
		"vmulpd	%%zmm7,%%zmm3,%%zmm3		\n\t"\
		"vmovaps	%%zmm4,%%zmm6		\n\t"\
		"vaddpd	%%zmm2,%%zmm1,%%zmm1		\n\t"\
		"vmovaps	%%zmm5,%%zmm7		\n\t"\
		"vmulpd	0x80(%%rsi),%%zmm4,%%zmm4	/* cd */\n\t"\
		"vsubpd	%%zmm3,%%zmm0,%%zmm0		\n\t"\
		"vmulpd	0x80(%%rsi),%%zmm5,%%zmm5	\n\t"\
		"vmulpd	0xc0(%%rsi),%%zmm6,%%zmm6	\n\t"\
		"vmovaps	%%zmm0,%%zmm2		\n\t"\
		"vmulpd	0xc0(%%rsi),%%zmm7,%%zmm7	\n\t"\
		"vaddpd	%%zmm6,%%zmm5,%%zmm5		\n\t"\
		"vmovaps	%%zmm1,%%zmm3		\n\t"\
		"vsubpd	%%zmm7,%%zmm4,%%zmm4		\n\t"\
		"vaddpd	%%zmm4,%%zmm0,%%zmm0		\n\t"\
		"vaddpd	%%zmm5,%%zmm1,%%zmm1		\n\t"\
		"vsubpd	%%zmm4,%%zmm2,%%zmm2		\n\t"\
		"vsubpd	%%zmm5,%%zmm3,%%zmm3		\n\t"\
		"/* Do	the p2,3 combo: */	\n\t"\
		"addq	$0x100,%%rsi 	/* ce,f */\n\t"\
		"vmovaps	    (%%rdx),%%zmm4	\n\t"\
		"vmovaps	0x40(%%rdx),%%zmm5	\n\t"\
		"vmovaps	    (%%rdx),%%zmm6	\n\t"\
		"vmovaps	0x40(%%rdx),%%zmm7	\n\t"\
		"vmulpd	0x80(%%rsi),%%zmm4,%%zmm4	/* cf */\n\t"\
		"vmulpd	0x80(%%rsi),%%zmm5,%%zmm5	\n\t"\
		"vmulpd	0xc0(%%rsi),%%zmm6,%%zmm6	\n\t"\
		"vmulpd	0xc0(%%rsi),%%zmm7,%%zmm7	\n\t"\
		"vaddpd	%%zmm6,%%zmm5,%%zmm5		\n\t"\
		"vsubpd	%%zmm7,%%zmm4,%%zmm4		\n\t"\
		"vmovaps	%%zmm5,0x40(%%rax)	\n\t"\
		"vmovaps	%%zmm4,    (%%rax)	\n\t"\
		"vmovaps	    (%%rbx),%%zmm4	\n\t"\
		"vmovaps	0x40(%%rbx),%%zmm5	\n\t"\
		"vmovaps	    (%%rbx),%%zmm6	\n\t"\
		"vmovaps	0x40(%%rbx),%%zmm7	\n\t"\
		"vmulpd	    (%%rsi),%%zmm4,%%zmm4	/* ce */\n\t"\
		"vmulpd	    (%%rsi),%%zmm5,%%zmm5	\n\t"\
		"vmulpd	0x40(%%rsi),%%zmm6,%%zmm6	\n\t"\
		"vmulpd	0x40(%%rsi),%%zmm7,%%zmm7	\n\t"\
		"vaddpd	%%zmm6,%%zmm5,%%zmm5		\n\t"\
		"vsubpd	%%zmm7,%%zmm4,%%zmm4		\n\t"\
		"vmovaps	%%zmm5,%%zmm7		\n\t"\
		"vmovaps	%%zmm4,%%zmm6		\n\t"\
		"vsubpd	    (%%rax),%%zmm4,%%zmm4	\n\t"\
		"vsubpd	0x40(%%rax),%%zmm5,%%zmm5	\n\t"\
		"vaddpd	    (%%rax),%%zmm6,%%zmm6	\n\t"\
		"vaddpd	0x40(%%rax),%%zmm7,%%zmm7	\n\t"\
		"/* Finish radix-4 bfly, store results: */\n\t"\
		"vsubpd	%%zmm6,%%zmm0,%%zmm0		\n\t"\
		"vsubpd	%%zmm5,%%zmm2,%%zmm2		\n\t"\
		"vsubpd	%%zmm7,%%zmm1,%%zmm1		\n\t"\
		"vsubpd	%%zmm4,%%zmm3,%%zmm3		\n\t"\
		"vmovaps	%%zmm0,    (%%rcx)	\n\t"\
		"vmovaps	%%zmm2,    (%%rbx)	\n\t"\
		"vmovaps	%%zmm1,0x40(%%rcx)	\n\t"\
		"vmovaps	%%zmm3,0x40(%%rdx)	\n\t"\
		"vaddpd	%%zmm6,%%zmm6,%%zmm6		\n\t"\
		"vaddpd	%%zmm5,%%zmm5,%%zmm5		\n\t"\
		"vaddpd	%%zmm7,%%zmm7,%%zmm7		\n\t"\
		"vaddpd	%%zmm4,%%zmm4,%%zmm4		\n\t"\
		"vaddpd	%%zmm0,%%zmm6,%%zmm6		\n\t"\
		"vaddpd	%%zmm2,%%zmm5,%%zmm5		\n\t"\
		"vaddpd	%%zmm1,%%zmm7,%%zmm7		\n\t"\
		"vaddpd	%%zmm3,%%zmm4,%%zmm4		\n\t"\
		"vmovaps	%%zmm6,    (%%rax)	\n\t"\
		"vmovaps	%%zmm5,    (%%rdx)	\n\t"\
		"vmovaps	%%zmm7,0x40(%%rax)	\n\t"\
		"vmovaps	%%zmm4,0x40(%%rbx)	\n\t"\
	/*************************************************************************************/\
	/*  And now do four more radix-4 transforms, including the internal twiddle factors: */\
	/*************************************************************************************/\
	/* Block 0: Combine 0-output of each radix-4, i.e. inputs from __in0 + [0,4,8,c]*istride: */\
		"movq	%[__in0],%%rax		\n\t"\
		"leaq	%c[__i4](%%rax),%%rbx	\n\t"/* __in0 +   [4*istride] */\
		"leaq	%c[__i4](%%rbx),%%rcx	\n\t"/* __in0 + 2*[4*istride] */\
		"leaq	%c[__i4](%%rcx),%%rdx	\n\t"/* __in0 + 3*[4*istride] */\
		"vmovaps	    (%%rax),%%zmm0	\n\t"\
		"vmovaps	    (%%rcx),%%zmm4	\n\t"\
		"vmovaps	0x40(%%rax),%%zmm1	\n\t"\
		"vmovaps	0x40(%%rcx),%%zmm5	\n\t"\
		"vmovaps	    (%%rbx),%%zmm2	\n\t"\
		"vmovaps	    (%%rdx),%%zmm6	\n\t"\
		"vmovaps	0x40(%%rbx),%%zmm3	\n\t"\
		"vmovaps	0x40(%%rdx),%%zmm7	\n\t"\
		"vsubpd	%%zmm2,%%zmm0,%%zmm0		\n\t"\
		"vsubpd	%%zmm6,%%zmm4,%%zmm4		\n\t"\
		"vsubpd	%%zmm3,%%zmm1,%%zmm1		\n\t"\
		"vsubpd	%%zmm7,%%zmm5,%%zmm5		\n\t"\
		"vaddpd	%%zmm2,%%zmm2,%%zmm2		\n\t"\
		"vaddpd	%%zmm6,%%zmm6,%%zmm6		\n\t"\
		"vaddpd	%%zmm3,%%zmm3,%%zmm3		\n\t"\
		"vaddpd	%%zmm7,%%zmm7,%%zmm7		\n\t"\
		"vaddpd	%%zmm0,%%zmm2,%%zmm2		\n\t"\
		"vaddpd	%%zmm4,%%zmm6,%%zmm6		\n\t"\
		"vaddpd	%%zmm1,%%zmm3,%%zmm3		\n\t"\
		"vaddpd	%%zmm5,%%zmm7,%%zmm7		\n\t"\
		"vsubpd	%%zmm6,%%zmm2,%%zmm2		\n\t"\
		"vsubpd	%%zmm7,%%zmm3,%%zmm3		\n\t"\
		"vaddpd	%%zmm6,%%zmm6,%%zmm6		\n\t"\
		"vaddpd	%%zmm7,%%zmm7,%%zmm7		\n\t"\
	/* Load output base-address into r8 and offset-array pointer into r9: */\
	"movq	%[__out0],%%r8	\n\t	movq	%[__off],%%r9	\n\t"\
	/* Block 0: r0-3 */\
		"movslq		    (%%r9),%%r10	\n\t"/* off0-3: int32 offsets in units of doubles */\
		"movslq		0x04(%%r9),%%r11	\n\t"\
		"movslq		0x08(%%r9),%%r12	\n\t"\
		"movslq		0x0c(%%r9),%%r13	\n\t"\
		"leaq	(%%r8,%%r10,8),%%r10	\n\t"/* out0 + off0-3 */\
		"leaq	(%%r8,%%r11,8),%%r11	\n\t"\
		"leaq	(%%r8,%%r12,8),%%r12	\n\t"\
		"leaq	(%%r8,%%r13,8),%%r13	\n\t"\
	"prefetcht1	0x200(%%r11)\n\t"\
		"vmovaps	%%zmm2,    (%%r11)	\n\t"\
		"vmovaps	%%zmm3,0x40(%%r11)	\n\t"\
		"vaddpd	%%zmm2,%%zmm6,%%zmm6	\n\t"\
		"vaddpd	%%zmm3,%%zmm7,%%zmm7	\n\t"\
		"vmovaps	%%zmm6,    (%%r10)	\n\t"\
		"vmovaps	%%zmm7,0x40(%%r10)	\n\t"\
		"vsubpd	%%zmm5,%%zmm0,%%zmm0	\n\t"\
		"vsubpd	%%zmm4,%%zmm1,%%zmm1	\n\t"\
		"vaddpd	%%zmm5,%%zmm5,%%zmm5	\n\t"\
		"vaddpd	%%zmm4,%%zmm4,%%zmm4	\n\t"\
		"vmovaps	%%zmm0,    (%%r12)	\n\t"\
		"vmovaps	%%zmm1,0x40(%%r13)	\n\t"\
		"vaddpd	%%zmm0,%%zmm5,%%zmm5	\n\t"\
		"vaddpd	%%zmm1,%%zmm4,%%zmm4	\n\t"\
		"vmovaps	%%zmm5,    (%%r13)	\n\t"\
		"vmovaps	%%zmm4,0x40(%%r12)	\n\t"\
	/* Block 2: Combine 1-output of each radix-4, i.e. inputs from __in0 + [1,5,9,d]*istride: */\
		"movq	%[__isrt2],%%rsi 	\n\t"\
		"vmovaps	(%%rsi),%%zmm3	/* isrt2 */\n\t"\
		"addq	$%c[__i1],%%rax	\n\t"/* __in0 + 1*istride */\
		"addq	$%c[__i1],%%rbx	\n\t"/* __in0 + 5*istride */\
		"addq	$%c[__i1],%%rcx	\n\t"/* __in0 + 9*istride */\
		"addq	$%c[__i1],%%rdx	\n\t"/* __in0 + d*istride */\
	"prefetcht1	0x200(%%r13)\n\t"\
		"vmovaps	    (%%rcx),%%zmm4	\n\t"\
		"vmovaps	0x40(%%rcx),%%zmm5	\n\t"\
		"vmovaps	    (%%rdx),%%zmm6	\n\t"\
		"vmovaps	0x40(%%rdx),%%zmm7	\n\t"\
		"vmulpd	%%zmm3,%%zmm4,%%zmm4		\n\t"\
		"vmovaps	    (%%rax),%%zmm0	\n\t"\
		"vmulpd	%%zmm3,%%zmm5,%%zmm5		\n\t"\
		"vmovaps	0x40(%%rax),%%zmm1	\n\t"\
		"vmulpd	%%zmm3,%%zmm6,%%zmm6		\n\t"\
		"vmovaps	    (%%rbx),%%zmm2	\n\t"\
		"vmulpd	%%zmm3,%%zmm7,%%zmm7		\n\t"\
		"vmovaps	0x40(%%rbx),%%zmm3	\n\t"/* zmm3 now reused for __in0+5*istride.im */\
		"vsubpd	%%zmm3,%%zmm0,%%zmm0		\n\t"\
		"vsubpd	%%zmm5,%%zmm4,%%zmm4		\n\t"\
		"vsubpd	%%zmm2,%%zmm1,%%zmm1		\n\t"\
		"vsubpd	%%zmm6,%%zmm7,%%zmm7		\n\t"\
		"vaddpd	%%zmm3,%%zmm3,%%zmm3		\n\t"\
		"vaddpd	%%zmm5,%%zmm5,%%zmm5		\n\t"\
		"vaddpd	%%zmm2,%%zmm2,%%zmm2		\n\t"\
		"vaddpd	%%zmm6,%%zmm6,%%zmm6		\n\t"\
		"vaddpd	%%zmm0,%%zmm3,%%zmm3		\n\t"\
		"vaddpd	%%zmm4,%%zmm5,%%zmm5		\n\t"\
		"vaddpd	%%zmm1,%%zmm2,%%zmm2		\n\t"\
		"vaddpd	%%zmm7,%%zmm6,%%zmm6		\n\t"\
		"vsubpd	%%zmm6,%%zmm4,%%zmm4		\n\t"\
		"vsubpd	%%zmm7,%%zmm5,%%zmm5		\n\t"\
		"vaddpd	%%zmm6,%%zmm6,%%zmm6		\n\t"\
		"vaddpd	%%zmm7,%%zmm7,%%zmm7		\n\t"\
		"vaddpd	%%zmm4,%%zmm6,%%zmm6		\n\t"\
		"vaddpd	%%zmm5,%%zmm7,%%zmm7		\n\t"\
		"vsubpd	%%zmm4,%%zmm0,%%zmm0		\n\t"\
		"vsubpd	%%zmm7,%%zmm3,%%zmm3		\n\t"\
		"vsubpd	%%zmm5,%%zmm2,%%zmm2		\n\t"\
		"vsubpd	%%zmm6,%%zmm1,%%zmm1		\n\t"\
		"vaddpd	%%zmm4,%%zmm4,%%zmm4		\n\t"\
		"vaddpd	%%zmm7,%%zmm7,%%zmm7		\n\t"\
		"vaddpd	%%zmm5,%%zmm5,%%zmm5		\n\t"\
		"vaddpd	%%zmm6,%%zmm6,%%zmm6		\n\t"\
		"movslq		0x10(%%r9),%%r10	\n\t"/* off4-7 */\
		"movslq		0x14(%%r9),%%r11	\n\t"\
		"movslq		0x18(%%r9),%%r12	\n\t"\
		"movslq		0x1c(%%r9),%%r13	\n\t"\
		"leaq	(%%r8,%%r10,8),%%r10	\n\t"/* out0 + off4-7 */\
		"leaq	(%%r8,%%r11,8),%%r11	\n\t"\
		"leaq	(%%r8,%%r12,8),%%r12	\n\t"\
		"leaq	(%%r8,%%r13,8),%%r13	\n\t"\
	"prefetcht1	0x200(%%r11)\n\t"\
		"vmovaps	%%zmm0,    (%%r11)	\n\t"\
		"vmovaps	%%zmm3,    (%%r12)	\n\t"\
		"vmovaps	%%zmm2,0x40(%%r11)	\n\t"\
		"vmovaps	%%zmm1,0x40(%%r13)	\n\t"\
		"vaddpd	%%zmm0,%%zmm4,%%zmm4	\n\t"\
		"vaddpd	%%zmm3,%%zmm7,%%zmm7	\n\t"\
		"vaddpd	%%zmm2,%%zmm5,%%zmm5	\n\t"\
		"vaddpd	%%zmm1,%%zmm6,%%zmm6	\n\t"\
		"vmovaps	%%zmm4,    (%%r10)	\n\t"\
		"vmovaps	%%zmm7,    (%%r13)	\n\t"\
		"vmovaps	%%zmm5,0x40(%%r10)	\n\t"\
		"vmovaps	%%zmm6,0x40(%%r12)	\n\t"\
	/* Block 1: Combine 2-output of each radix-4, i.e. inputs from __in0 + [2,6,a,e]*istride: */\
		"addq	$%c[__i1],%%rax	\n\t"/* __in0 + 2*istride */\
		"addq	$%c[__i1],%%rbx	\n\t"/* __in0 + 6*istride */\
		"addq	$%c[__i1],%%rcx	\n\t"/* __in0 + a*istride */\
		"addq	$%c[__i1],%%rdx	\n\t"/* __in0 + e*istride */\
	"prefetcht1	0x200(%%r13)\n\t"\
		"vmovaps	    (%%rcx),%%zmm4	\n\t"\
		"vmovaps	0x40(%%rcx),%%zmm5	\n\t"\
		"vmovaps	0x40(%%rsi),%%zmm3	/* cc0, using isrt2 as base-ptr */\n\t"\
		"vmovaps	0x80(%%rsi),%%zmm2	/* ss0, using isrt2 as base-ptr */\n\t"\
		"vmovaps	%%zmm4,%%zmm6		\n\t"\
		"vmovaps	%%zmm5,%%zmm7		\n\t"\
		"vmulpd	%%zmm3,%%zmm4,%%zmm4		\n\t"\
		"vmulpd	%%zmm3,%%zmm5,%%zmm5		\n\t"\
		"vmulpd	%%zmm2,%%zmm6,%%zmm6		\n\t"\
		"vmovaps	    (%%rdx),%%zmm0	\n\t"\
		"vmulpd	%%zmm2,%%zmm7,%%zmm7		\n\t"\
		"vmovaps	0x40(%%rdx),%%zmm1	\n\t"\
		"vaddpd	%%zmm6,%%zmm5,%%zmm5		\n\t"\
		"vmovaps	%%zmm0,%%zmm6		\n\t"\
		"vsubpd	%%zmm7,%%zmm4,%%zmm4		\n\t"\
		"vmovaps	%%zmm1,%%zmm7		\n\t"\
		"vmulpd	%%zmm2,%%zmm6,%%zmm6		\n\t"\
		"vmulpd	%%zmm2,%%zmm7,%%zmm7		\n\t"\
		"vmulpd	%%zmm3,%%zmm0,%%zmm0		\n\t"\
		"vmulpd	%%zmm3,%%zmm1,%%zmm1		\n\t"\
		"vaddpd	%%zmm0,%%zmm7,%%zmm7		\n\t"\
		"vsubpd	%%zmm1,%%zmm6,%%zmm6		\n\t"\
		"vmovaps	%%zmm4,%%zmm2		\n\t"\
		"vmovaps	%%zmm5,%%zmm3		\n\t"\
		"vsubpd	%%zmm6,%%zmm4,%%zmm4		\n\t"\
		"vsubpd	%%zmm7,%%zmm5,%%zmm5		\n\t"\
		"vaddpd	%%zmm2,%%zmm6,%%zmm6		\n\t"\
		"vaddpd	%%zmm3,%%zmm7,%%zmm7		\n\t"\
		"vmovaps	    (%%rbx),%%zmm2	\n\t"\
		"vmovaps	0x40(%%rbx),%%zmm3	\n\t"\
		"vmovaps	    (%%rsi),%%zmm1	/* isrt2 */\n\t"\
		"vmovaps	%%zmm2,%%zmm0		\n\t"\
		"vsubpd	%%zmm3,%%zmm2,%%zmm2		\n\t"\
		"vaddpd	%%zmm0,%%zmm3,%%zmm3		\n\t"\
		"vmulpd	%%zmm1,%%zmm2,%%zmm2		\n\t"\
		"vmulpd	%%zmm1,%%zmm3,%%zmm3		\n\t"\
		"vmovaps	    (%%rax),%%zmm0	\n\t"\
		"vmovaps	0x40(%%rax),%%zmm1	\n\t"\
		"vsubpd	%%zmm2,%%zmm0,%%zmm0		\n\t"\
		"vsubpd	%%zmm3,%%zmm1,%%zmm1		\n\t"\
		"vaddpd	%%zmm2,%%zmm2,%%zmm2		\n\t"\
		"vaddpd	%%zmm3,%%zmm3,%%zmm3		\n\t"\
		"vaddpd	%%zmm0,%%zmm2,%%zmm2		\n\t"\
		"vaddpd	%%zmm1,%%zmm3,%%zmm3		\n\t"\
		"vsubpd	%%zmm6,%%zmm2,%%zmm2		\n\t"\
		"vsubpd	%%zmm5,%%zmm0,%%zmm0		\n\t"\
		"vsubpd	%%zmm7,%%zmm3,%%zmm3		\n\t"\
		"vsubpd	%%zmm4,%%zmm1,%%zmm1		\n\t"\
		"vaddpd	%%zmm6,%%zmm6,%%zmm6		\n\t"\
		"vaddpd	%%zmm5,%%zmm5,%%zmm5		\n\t"\
		"vaddpd	%%zmm7,%%zmm7,%%zmm7		\n\t"\
		"vaddpd	%%zmm4,%%zmm4,%%zmm4		\n\t"\
		"movslq		0x20(%%r9),%%r10	\n\t"/* off8-b */\
		"movslq		0x24(%%r9),%%r11	\n\t"\
		"movslq		0x28(%%r9),%%r12	\n\t"\
		"movslq		0x2c(%%r9),%%r13	\n\t"\
		"leaq	(%%r8,%%r10,8),%%r10	\n\t"/* out0 + off8-b */\
		"leaq	(%%r8,%%r11,8),%%r11	\n\t"\
		"leaq	(%%r8,%%r12,8),%%r12	\n\t"\
		"leaq	(%%r8,%%r13,8),%%r13	\n\t"\
	"prefetcht1	0x200(%%r11)\n\t"\
		"vmovaps	%%zmm2,    (%%r11)	\n\t"\
		"vmovaps	%%zmm0,    (%%r12)	\n\t"\
		"vmovaps	%%zmm3,0x40(%%r11)	\n\t"\
		"vmovaps	%%zmm1,0x40(%%r13)	\n\t"\
		"vaddpd	%%zmm2,%%zmm6,%%zmm6	\n\t"\
		"vaddpd	%%zmm0,%%zmm5,%%zmm5	\n\t"\
		"vaddpd	%%zmm3,%%zmm7,%%zmm7	\n\t"\
		"vaddpd	%%zmm1,%%zmm4,%%zmm4	\n\t"\
		"vmovaps	%%zmm6,    (%%r10)	\n\t"\
		"vmovaps	%%zmm5,    (%%r13)	\n\t"\
		"vmovaps	%%zmm7,0x40(%%r10)	\n\t"\
		"vmovaps	%%zmm4,0x40(%%r12)	\n\t"\
	/* Block 3: Combine 3-output of each radix-4, i.e. inputs from __in0 + [3,7,b,f]*istride: */\
		"addq	$%c[__i1],%%rax	\n\t"/* __in0 + 3*istride */\
		"addq	$%c[__i1],%%rbx	\n\t"/* __in0 + 7*istride */\
		"addq	$%c[__i1],%%rcx	\n\t"/* __in0 + b*istride */\
		"addq	$%c[__i1],%%rdx	\n\t"/* __in0 + f*istride */\
	"prefetcht1	0x200(%%r13)\n\t"\
		"vmovaps	    (%%rcx),%%zmm4	\n\t"\
		"vmovaps	0x40(%%rcx),%%zmm5	\n\t"\
		"vmovaps	0x40(%%rsi),%%zmm2	/* cc0, using isrt2 as base-ptr */\n\t"\
		"vmovaps	0x80(%%rsi),%%zmm3	/* ss0, using isrt2 as base-ptr */\n\t"\
		"vmovaps	%%zmm4,%%zmm6		\n\t"\
		"vmovaps	%%zmm5,%%zmm7		\n\t"\
		"vmulpd	%%zmm3,%%zmm4,%%zmm4		\n\t"\
		"vmulpd	%%zmm3,%%zmm5,%%zmm5		\n\t"\
		"vmulpd	%%zmm2,%%zmm6,%%zmm6		\n\t"\
		"vmovaps	    (%%rdx),%%zmm0	\n\t"\
		"vmulpd	%%zmm2,%%zmm7,%%zmm7		\n\t"\
		"vmovaps	0x40(%%rdx),%%zmm1	\n\t"\
		"vaddpd	%%zmm6,%%zmm5,%%zmm5		\n\t"\
		"vmovaps	%%zmm0,%%zmm6		\n\t"\
		"vsubpd	%%zmm7,%%zmm4,%%zmm4		\n\t"\
		"vmovaps	%%zmm1,%%zmm7		\n\t"\
		"vmulpd	%%zmm2,%%zmm6,%%zmm6		\n\t"\
		"vmulpd	%%zmm2,%%zmm7,%%zmm7		\n\t"\
		"vmulpd	%%zmm3,%%zmm0,%%zmm0		\n\t"\
		"vmulpd	%%zmm3,%%zmm1,%%zmm1		\n\t"\
		"vaddpd	%%zmm0,%%zmm7,%%zmm7		\n\t"\
		"vsubpd	%%zmm1,%%zmm6,%%zmm6		\n\t"\
		"vmovaps	%%zmm4,%%zmm2		\n\t"\
		"vmovaps	%%zmm5,%%zmm3		\n\t"\
		"vsubpd	%%zmm6,%%zmm4,%%zmm4		\n\t"\
		"vsubpd	%%zmm7,%%zmm5,%%zmm5		\n\t"\
		"vaddpd	%%zmm2,%%zmm6,%%zmm6		\n\t"\
		"vaddpd	%%zmm3,%%zmm7,%%zmm7		\n\t"\
		"vmovaps	    (%%rbx),%%zmm2	\n\t"\
		"vmovaps	0x40(%%rbx),%%zmm3	\n\t"\
		"vmovaps	    (%%rsi),%%zmm1		/* isrt2 */\n\t"\
		"vmovaps	%%zmm2,%%zmm0		\n\t"\
		"vaddpd	%%zmm3,%%zmm2,%%zmm2		\n\t"\
		"vsubpd	%%zmm0,%%zmm3,%%zmm3		\n\t"\
		"vmulpd	%%zmm1,%%zmm2,%%zmm2		\n\t"\
		"vmulpd	%%zmm1,%%zmm3,%%zmm3		\n\t"\
		"vmovaps	    (%%rax),%%zmm0	\n\t"\
		"vmovaps	0x40(%%rax),%%zmm1	\n\t"\
		"vsubpd	%%zmm2,%%zmm0,%%zmm0		\n\t"\
		"vsubpd	%%zmm3,%%zmm1,%%zmm1		\n\t"\
		"vaddpd	%%zmm2,%%zmm2,%%zmm2		\n\t"\
		"vaddpd	%%zmm3,%%zmm3,%%zmm3		\n\t"\
		"vaddpd	%%zmm0,%%zmm2,%%zmm2		\n\t"\
		"vaddpd	%%zmm1,%%zmm3,%%zmm3		\n\t"\
		"vsubpd	%%zmm4,%%zmm0,%%zmm0		\n\t"\
		"vsubpd	%%zmm7,%%zmm2,%%zmm2		\n\t"\
		"vsubpd	%%zmm5,%%zmm1,%%zmm1		\n\t"\
		"vsubpd	%%zmm6,%%zmm3,%%zmm3		\n\t"\
		"vaddpd	%%zmm4,%%zmm4,%%zmm4		\n\t"\
		"vaddpd	%%zmm7,%%zmm7,%%zmm7		\n\t"\
		"vaddpd	%%zmm5,%%zmm5,%%zmm5		\n\t"\
		"vaddpd	%%zmm6,%%zmm6,%%zmm6		\n\t"\
		"movslq		0x30(%%r9),%%r10	\n\t"/* offc-f */\
		"movslq		0x34(%%r9),%%r11	\n\t"\
		"movslq		0x38(%%r9),%%r12	\n\t"\
		"movslq		0x3c(%%r9),%%r13	\n\t"\
		"leaq	(%%r8,%%r10,8),%%r10	\n\t"/* out0 + offc-f */\
		"leaq	(%%r8,%%r11,8),%%r11	\n\t"\
		"leaq	(%%r8,%%r12,8),%%r12	\n\t"\
		"leaq	(%%r8,%%r13,8),%%r13	\n\t"\
	"prefetcht1	0x200(%%r11)\n\t"\
		"vmovaps	%%zmm0,    (%%r11)	\n\t"\
		"vmovaps	%%zmm2,    (%%r12)	\n\t"\
		"vmovaps	%%zmm1,0x40(%%r11)	\n\t"\
		"vmovaps	%%zmm3,0x40(%%r13)	\n\t"\
		"vaddpd	%%zmm0,%%zmm4,%%zmm4	\n\t"\
		"vaddpd	%%zmm2,%%zmm7,%%zmm7	\n\t"\
		"vaddpd	%%zmm1,%%zmm5,%%zmm5	\n\t"\
		"vaddpd	%%zmm3,%%zmm6,%%zmm6	\n\t"\
		"vmovaps	%%zmm4,    (%%r10)	\n\t"\
		"vmovaps	%%zmm7,    (%%r13)	\n\t"\
		"vmovaps	%%zmm5,0x40(%%r10)	\n\t"\
		"vmovaps	%%zmm6,0x40(%%r12)	\n\t"\
	"prefetcht1	0x200(%%r13)\n\t"\
		:					/* outputs: none */\
		: [__in0] "m" (Xin0)	/* All inputs from memory addresses here */\
		 ,[__i1] "e" (Xi1)\
		 ,[__i4] "e" (Xi4)\
		 ,[__out0] "m" (Xout0)\
		 ,[__off] "m" (Xoff)\
		 ,[__isrt2] "m" (Xisrt2)\
		 ,[__c1] "m" (Xc1)\
		: "cc","memory","rax","rbx","rcx","rdx","rsi","r8","r9","r10","r11","r12","r13","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7"		/* Clobbered registers */\
	);\
	}

	// Here are the array-of-doubles index offsets w.r.to the __c = cc0 base-root address of the various derived sincos terms:
	// Datum	Offset	Datum	Offset
	// ------	------	------	------
	// __c1_c	0x00	__sc	0x01
	// __c1i2	0x02	__c2i2	0x03
	// __c8		0x04	__r8	0x05
	// __c4		0x06	__r4	0x07
	// __cC4	0x08	__rC	0x09
	// __c2		0x0a	__r2	0x0b
	// __cA2	0x0c	__rA	0x0d
	// __c62	0x0e	__r6	0x0f
	// __cE6	0x10	__rE	0x11
	// __c1		0x12	__r1	0x13
	// __c91	0x14	__r9	0x15
	// __c51	0x16	__r5	0x17
	// __cD5	0x18	__rD	0x19
	// __c31	0x1a	__r3	0x1b
	// __cB3	0x1c	__rB	0x1d
	// __c73	0x1e	__r7	0x1f
	// __cF7	0x20	__rF	0x21

	// Remember that for AVX2-style 3-operand FMA in AT&T syntax, the result overwrites the rightmost input!
	#define SSE2_RADIX16_DIF_FMA_OOP(Xin0,Xi1,Xi2,Xi3,Xi4, Xout0,Xout1,Xout2,Xout3,Xout4,Xout5,Xout6,Xout7,Xout8,Xout9,Xouta,Xoutb,Xoutc,Xoutd,Xoute,Xoutf, Xcc0)\
	{\
	__asm__ volatile (\
	/*...Block 0: Do in-place, i.e. outputs into __in0 + [0,1,2,3]*istride: */\
		"movq	%[__in0],%%rax		\n\t"\
		"leaq	%c[__i1](%%rax),%%rcx	\n\t"/* __in0 +   istride */\
		"leaq	%c[__i2](%%rax),%%rbx	\n\t"/* __in0 + 2*istride */\
		"leaq	%c[__i3](%%rax),%%rdx	\n\t"/* __in0 + 3*istride */\
		"movq	%[__cc0],%%rsi 			\n\t"\
		"vbroadcastsd 0x28(%%rsi),%%zmm13 \n\t vbroadcastsd 0x38(%%rsi),%%zmm14 \n\t vbroadcastsd 0x48(%%rsi),%%zmm15 \n\t"/* load __r8,r4,rC into zmm13-15 */\
		"vmovaps		     (%%rcx),%%zmm4 		\n\t	vmovaps			0x040(%%rcx),%%zmm5 		\n\t"/*	t04 =__A8r;					t05 =__A8i; */\
		"vmovaps		     (%%rax),%%zmm0 		\n\t	vmovaps			0x040(%%rax),%%zmm1 		\n\t"/*	t00 =__A0r;					t01 =__A0i; */\
		"vmovaps		%%zmm4,%%zmm6				\n\t"/*	t06 = t04; */\
		"vfnmadd231pd	%%zmm5 ,%%zmm13,%%zmm4 		\n\t	 vfmadd231pd	%%zmm6 ,%%zmm13,%%zmm5 		\n\t"/*	FNMA231(  t05,__r8,t04);	 FMA231(  t06,__r8,t05); */\
		"vmovaps		     (%%rbx),%%zmm8			\n\t	vmovaps			0x040(%%rbx),%%zmm9 		\n\t"/*	_a =__A4r;					_b =__A4i; */\
		"vfnmadd231pd	0x040(%%rbx),%%zmm14,%%zmm8 \n\t	 vfmadd231pd	     (%%rbx),%%zmm14,%%zmm9 \n\t"/*	FNMA231(__A4i,__r4,_a );	 FMA231(__A4r,__r4,_b ); */\
		"vbroadcastsd	0x040(%%rsi),%%zmm13		\n\t	vbroadcastsd	0x020(%%rsi),%%zmm14		\n\t"/* load __cC4,c8 into pair of regs */\
		"vmovaps		     (%%rdx),%%zmm6			\n\t	vmovaps			0x040(%%rdx),%%zmm7 		\n\t"/*	t06 =__ACr;					t07 =__ACi; */\
		"vfnmadd231pd	0x040(%%rdx),%%zmm15,%%zmm6 \n\t	 vfmadd231pd	     (%%rdx),%%zmm15,%%zmm7 \n\t"/*	FNMA231(__ACi,__rC,t06);	 FMA231(__ACr,__rC,t07); */\
		"vbroadcastsd	0x030(%%rsi),%%zmm15		\n\t"/* load __c4 */\
		"vmovaps		%%zmm8 ,%%zmm10				\n\t	vmovaps			%%zmm0,%%zmm2 				\n\t"/*	_c = _a;	t02 = t00; */\
		" vfmadd231pd	%%zmm6 ,%%zmm13,%%zmm8 		\n\t	 vfmadd231pd	%%zmm4 ,%%zmm14,%%zmm0 		\n\t"/*	 FMA231(t06,__cC4,_a);		 FMA231(t04,__c8,t00); */\
		"vmovaps		%%zmm9 ,%%zmm11				\n\t	vmovaps			%%zmm1 ,%%zmm3 				\n\t"/*	_d = _b;	t03 = t01; */\
		" vfmadd231pd	%%zmm7 ,%%zmm13,%%zmm9 		\n\t	 vfmadd231pd	%%zmm5 ,%%zmm14,%%zmm1 		\n\t"/*	 FMA231(t07,__cC4,_b);		 FMA231(t05,__c8,t01); */\
		"vfnmadd231pd	%%zmm6 ,%%zmm13,%%zmm10		\n\t	vfnmadd231pd	%%zmm4 ,%%zmm14,%%zmm2 		\n\t"/*	FNMA231(t06,__cC4,_c);		FNMA231(t04,__c8,t02); */\
		"vfnmadd231pd	%%zmm7 ,%%zmm13,%%zmm11		\n\t	vfnmadd231pd	%%zmm5 ,%%zmm14,%%zmm3 		\n\t"/*	FNMA231(t07,__cC4,_d);		FNMA231(t05,__c8,t03); */\
		"vmovaps		%%zmm0 ,%%zmm4 				\n\t	vmovaps			%%zmm1 ,%%zmm5 				\n\t"/*	t04 =t00; t05 =t01; */\
		"vfnmadd231pd	%%zmm8 ,%%zmm15,%%zmm4 		\n\t	 vfmadd231pd	%%zmm8 ,%%zmm15,%%zmm0 		\n\t"/*	FNMA231(_a ,__c4 ,t04);		 FMA231(_a ,__c4 ,t00); */\
		"vfnmadd231pd	%%zmm9 ,%%zmm15,%%zmm5 		\n\t	 vfmadd231pd	%%zmm9 ,%%zmm15,%%zmm1 		\n\t"/*	FNMA231(_b ,__c4 ,t05);		 FMA231(_b ,__c4 ,t01); */\
		"vmovaps		%%zmm2 ,%%zmm6 				\n\t	vmovaps			%%zmm3 ,%%zmm7 				\n\t"/*	t06 =t02;	t07 =t03; */\
		" vfmadd231pd	%%zmm11,%%zmm15,%%zmm6 		\n\t	vfnmadd231pd	%%zmm11,%%zmm15,%%zmm2 		\n\t"/*	 FMA231(_d ,__c4 ,t06);		FNMA231(_d ,__c4 ,t02); */\
		"vfnmadd231pd	%%zmm10,%%zmm15,%%zmm7 		\n\t	 vfmadd231pd	%%zmm10,%%zmm15,%%zmm3 		\n\t"/*	FNMA231(_c ,__c4 ,t07);		 FMA231(_c ,__c4 ,t03); */\
		"vmovaps		%%zmm4 ,     (%%rcx)		\n\t	vmovaps			%%zmm0 ,     (%%rax)		\n\t"/* Write outputs into local store */\
		"vmovaps		%%zmm5 ,0x040(%%rcx)		\n\t	vmovaps			%%zmm1 ,0x040(%%rax)		\n\t"\
		"vmovaps		%%zmm6 ,     (%%rdx)		\n\t	vmovaps			%%zmm2 ,     (%%rbx)		\n\t"\
		"vmovaps		%%zmm7 ,0x040(%%rdx)		\n\t	vmovaps			%%zmm3 ,0x040(%%rbx)		\n\t"\
		"\n\t"\
	/*...Block 1: outputs into __in0 + [4,5,6,7]*istride: */\
		"addq	$%c[__i4],%%rax	\n\t"/* __in0 + 4*istride */\
		"addq	$%c[__i4],%%rcx	\n\t"/* __in0 + 5*istride */\
		"addq	$%c[__i4],%%rbx	\n\t"/* __in0 + 6*istride */\
		"addq	$%c[__i4],%%rdx	\n\t"/* __in0 + 7*istride */\
		"addq	$0x040,%%rsi 		/* cc += 8 */\n\t"\
		"vbroadcastsd 0x018(%%rsi),%%zmm12 	\n\t"/* load __r2 into zmm12 */\
		"vbroadcastsd 0x028(%%rsi),%%zmm13 	\n\t"/* load __rA into zmm13 */\
		"vbroadcastsd 0x038(%%rsi),%%zmm14 	\n\t"/* load __r6 into zmm14 */\
		"vbroadcastsd 0x048(%%rsi),%%zmm15 	\n\t"/* load __rE into zmm15 */\
		"vmovaps (%%rax),%%zmm0  \n\t vmovaps 0x040(%%rax),%%zmm1  \n\t"/* t08 =__A2r; t09 =__A2i; */\
		"vmovaps (%%rcx),%%zmm4  \n\t vmovaps 0x040(%%rcx),%%zmm5  \n\t"/* t12 =__AAr; t13 =__AAi; */\
		"vmovaps (%%rbx),%%zmm8  \n\t vmovaps 0x040(%%rbx),%%zmm9  \n\t"/* _a  =__A6r; _b  =__A6i; */\
		"vmovaps (%%rdx),%%zmm6  \n\t vmovaps 0x040(%%rdx),%%zmm7  \n\t"/* t14 =__AEr; t15 =__AEi; */\
		"vfnmadd231pd	0x040(%%rax),%%zmm12,%%zmm0 \n\t	 vfmadd231pd	(%%rax),%%zmm12,%%zmm1  	\n\t"/* FNMA231(__A2i,__r2,t08); FMA231(__A2r,__r2,t09); */\
		"vfnmadd231pd	0x040(%%rcx),%%zmm13,%%zmm4 \n\t	 vfmadd231pd	(%%rcx),%%zmm13,%%zmm5  	\n\t"/* FNMA231(__AAi,__rA,t12); FMA231(__AAr,__rA,t13); */\
		"vbroadcastsd	0x040(%%rsi),%%zmm13		\n\t"/* load __cE6 */\
		"vfnmadd231pd	0x040(%%rbx),%%zmm14,%%zmm8 \n\t	 vfmadd231pd	(%%rbx),%%zmm14,%%zmm9  	\n\t"/* FNMA231(__A6i,__r6,_a ); FMA231(__A6r,__r6,_b ); */\
		"vbroadcastsd	0x020(%%rsi),%%zmm14		\n\t"/* load __cA2 */\
		"vfnmadd231pd	0x040(%%rdx),%%zmm15,%%zmm6 \n\t	 vfmadd231pd	(%%rdx),%%zmm15,%%zmm7  	\n\t"/* FNMA231(__AEi,__rE,t14); FMA231(__AEr,__rE,t15); */\
		"vbroadcastsd	0x030(%%rsi),%%zmm15		\n\t"/* load __c62 */\
		"vmovaps		%%zmm8 ,%%zmm10				\n\t	vmovaps			%%zmm0,%%zmm2 				\n\t"/*	_c = _a;	t10 = t08; */\
		" vfmadd231pd	%%zmm6 ,%%zmm13,%%zmm8 		\n\t	 vfmadd231pd	%%zmm4 ,%%zmm14,%%zmm0 		\n\t"/*	 FMA231(t14,__cE6,_a);		 FMA231(t12,__cA2,t08); */\
		"vmovaps		%%zmm9 ,%%zmm11				\n\t	vmovaps			%%zmm1 ,%%zmm3 				\n\t"/*	_d = _b;	t11 = t09; */\
		" vfmadd231pd	%%zmm7 ,%%zmm13,%%zmm9 		\n\t	 vfmadd231pd	%%zmm5 ,%%zmm14,%%zmm1 		\n\t"/*	 FMA231(t15,__cE6,_b);		 FMA231(t13,__cA2,t09); */\
		"vfnmadd231pd	%%zmm6 ,%%zmm13,%%zmm10		\n\t	vfnmadd231pd	%%zmm4 ,%%zmm14,%%zmm2 		\n\t"/*	FNMA231(t14,__cE6,_c);		FNMA231(t12,__cA2,t10); */\
		"vfnmadd231pd	%%zmm7 ,%%zmm13,%%zmm11		\n\t	vfnmadd231pd	%%zmm5 ,%%zmm14,%%zmm3 		\n\t"/*	FNMA231(t15,__cE6,_d);		FNMA231(t13,__cA2,t11); */\
		"vmovaps		%%zmm0 ,%%zmm4 				\n\t	vmovaps			%%zmm1 ,%%zmm5 				\n\t"/*	t12 =t08 ;	t13 =t09; */\
		"vfnmadd231pd	%%zmm8 ,%%zmm15,%%zmm4 		\n\t	 vfmadd231pd	%%zmm8 ,%%zmm15,%%zmm0 		\n\t"/*	FNMA231(_a,__c62,t12);		 FMA231( _a,__c62,t08); */\
		"vfnmadd231pd	%%zmm9 ,%%zmm15,%%zmm5 		\n\t	 vfmadd231pd	%%zmm9 ,%%zmm15,%%zmm1 		\n\t"/*	FNMA231(_b,__c62,t13);		 FMA231( _b,__c62,t09); */\
		"vmovaps		%%zmm2 ,%%zmm6 				\n\t	vmovaps			%%zmm3 ,%%zmm7 				\n\t"/*	t14 =t10;	t15 =t11; */\
		" vfmadd231pd	%%zmm11,%%zmm15,%%zmm6 		\n\t	vfnmadd231pd	%%zmm11,%%zmm15,%%zmm2 		\n\t"/*	 FMA231(_d,__c62,t14);		FNMA231( _d,__c62,t10); */\
		"vfnmadd231pd	%%zmm10,%%zmm15,%%zmm7 		\n\t	 vfmadd231pd	%%zmm10,%%zmm15,%%zmm3 		\n\t"/*	FNMA231(_c,__c62,t15);		 FMA231( _c,__c62,t11); */\
		"vmovaps		%%zmm4 ,     (%%rcx)		\n\t	vmovaps			%%zmm0 ,     (%%rax)		\n\t"/* Write outputs into local store */\
		"vmovaps		%%zmm5 ,0x040(%%rcx)		\n\t	vmovaps			%%zmm1 ,0x040(%%rax)		\n\t"\
		"vmovaps		%%zmm6 ,     (%%rdx)		\n\t	vmovaps			%%zmm2 ,     (%%rbx)		\n\t"\
		"vmovaps		%%zmm7 ,0x040(%%rdx)		\n\t	vmovaps			%%zmm3 ,0x040(%%rbx)		\n\t"\
		"\n\t"\
	/*...Block 2: outputs into __in0 + [8,9,a,b]*istride: */\
		"addq	$%c[__i4],%%rax	\n\t"/* __in0 + 8*istride */\
		"addq	$%c[__i4],%%rcx	\n\t"/* __in0 + 9*istride */\
		"addq	$%c[__i4],%%rbx	\n\t"/* __in0 + a*istride */\
		"addq	$%c[__i4],%%rdx	\n\t"/* __in0 + b*istride */\
		"addq	$0x040,%%rsi 		/* cc += 8 */\n\t"\
		"vbroadcastsd 0x018(%%rsi),%%zmm12 	\n\t"/* load __r1 into zmm12 */\
		"vbroadcastsd 0x028(%%rsi),%%zmm13 	\n\t"/* load __r9 into zmm13 */\
		"vbroadcastsd 0x038(%%rsi),%%zmm14 	\n\t"/* load __r5 into zmm14 */\
		"vbroadcastsd 0x048(%%rsi),%%zmm15 	\n\t"/* load __rD into zmm15 */\
		"vmovaps (%%rax),%%zmm0  \n\t vmovaps 0x040(%%rax),%%zmm1  \n\t"/* t16 =__A1r;	t17 =__A1i; */\
		"vmovaps (%%rcx),%%zmm4  \n\t vmovaps 0x040(%%rcx),%%zmm5  \n\t"/* t20 =__A9r;	t21 =__A9i; */\
		"vmovaps (%%rbx),%%zmm8  \n\t vmovaps 0x040(%%rbx),%%zmm9  \n\t"/* _a=  __A5r;	_b  =__A5i; */\
		"vmovaps (%%rdx),%%zmm6  \n\t vmovaps 0x040(%%rdx),%%zmm7  \n\t"/* t22 =__ADr;	t23 =__ADi; */\
		"vfnmadd231pd	0x040(%%rax),%%zmm12,%%zmm0 \n\t	 vfmadd231pd	(%%rax),%%zmm12,%%zmm1  	\n\t"/* FNMA231(__A1i,__r1,t16);	 FMA231(__A1r,__r1,t17); */\
		"vfnmadd231pd	0x040(%%rcx),%%zmm13,%%zmm4 \n\t	 vfmadd231pd	(%%rcx),%%zmm13,%%zmm5  	\n\t"/* FNMA231(__A9i,__r9,t20);	 FMA231(__A9r,__r9,t21); */\
		"vbroadcastsd	0x040(%%rsi),%%zmm13		\n\t"/* load __cD5 */\
		"vfnmadd231pd	0x040(%%rbx),%%zmm14,%%zmm8 \n\t	 vfmadd231pd	(%%rbx),%%zmm14,%%zmm9  	\n\t"/* FNMA231(__A5i,__r5,_a );	 FMA231(__A5r,__r5,_b ); */\
		"vbroadcastsd	0x020(%%rsi),%%zmm14		\n\t"/* load __c91 */\
		"vfnmadd231pd	0x040(%%rdx),%%zmm15,%%zmm6 \n\t	 vfmadd231pd	(%%rdx),%%zmm15,%%zmm7  	\n\t"/* FNMA231(__ADi,__rD,t22);	 FMA231(__ADr,__rD,t23); */\
		"vbroadcastsd	0x030(%%rsi),%%zmm15		\n\t"/* load __c51 */\
		"vmovaps		%%zmm8 ,%%zmm10				\n\t	vmovaps			%%zmm0,%%zmm2 				\n\t"/*	_c= _a;	t18= t16; */\
		" vfmadd231pd	%%zmm6 ,%%zmm13,%%zmm8 		\n\t	 vfmadd231pd	%%zmm4 ,%%zmm14,%%zmm0 		\n\t"/*	 FMA231(t22,__cD5,_a);		 FMA231(t20,__c91,t16); */\
		"vmovaps		%%zmm9 ,%%zmm11				\n\t	vmovaps			%%zmm1 ,%%zmm3 				\n\t"/*	_d= _b;	t19= t17; */\
		" vfmadd231pd	%%zmm7 ,%%zmm13,%%zmm9 		\n\t	 vfmadd231pd	%%zmm5 ,%%zmm14,%%zmm1 		\n\t"/*	 FMA231(t23,__cD5,_b);		 FMA231(t21,__c91,t17); */\
		"vfnmadd231pd	%%zmm6 ,%%zmm13,%%zmm10		\n\t	vfnmadd231pd	%%zmm4 ,%%zmm14,%%zmm2 		\n\t"/*	FNMA231(t22,__cD5,_c);		FNMA231(t20,__c91,t18); */\
		"vfnmadd231pd	%%zmm7 ,%%zmm13,%%zmm11		\n\t	vfnmadd231pd	%%zmm5 ,%%zmm14,%%zmm3 		\n\t"/*	FNMA231(t23,__cD5,_d);		FNMA231(t21,__c91,t19); */\
		"vmovaps		%%zmm0 ,%%zmm4 				\n\t	vmovaps			%%zmm1 ,%%zmm5 				\n\t"/*	t20 =t16;	t21 =t17; */\
		"vfnmadd231pd	%%zmm8 ,%%zmm15,%%zmm4 		\n\t	 vfmadd231pd	%%zmm8 ,%%zmm15,%%zmm0 		\n\t"/*	FNMA231(_a,__c51,t20);		 FMA231(_a,__c51,t16); */\
		"vfnmadd231pd	%%zmm9 ,%%zmm15,%%zmm5 		\n\t	 vfmadd231pd	%%zmm9 ,%%zmm15,%%zmm1 		\n\t"/*	FNMA231(_b,__c51,t21);		 FMA231(_b,__c51,t17); */\
		"vmovaps		%%zmm2 ,%%zmm6 				\n\t	vmovaps			%%zmm3 ,%%zmm7 				\n\t"/*	t22 =t18;	t23 =t19; */\
		" vfmadd231pd	%%zmm11,%%zmm15,%%zmm6 		\n\t	vfnmadd231pd	%%zmm11,%%zmm15,%%zmm2 		\n\t"/*	 FMA231(_d,__c51,t22);		FNMA231(_d,__c51,t18); */\
		"vfnmadd231pd	%%zmm10,%%zmm15,%%zmm7 		\n\t	 vfmadd231pd	%%zmm10,%%zmm15,%%zmm3 		\n\t"/*	FNMA231(_c,__c51,t23);		 FMA231(_c,__c51,t19); */\
		"vmovaps		%%zmm4 ,     (%%rcx)		\n\t	vmovaps			%%zmm0 ,     (%%rax)		\n\t"/* Write outputs into local store */\
		"vmovaps		%%zmm5 ,0x040(%%rcx)		\n\t	vmovaps			%%zmm1 ,0x040(%%rax)		\n\t"\
		"vmovaps		%%zmm6 ,     (%%rdx)		\n\t	vmovaps			%%zmm2 ,     (%%rbx)		\n\t"\
		"vmovaps		%%zmm7 ,0x040(%%rdx)		\n\t	vmovaps			%%zmm3 ,0x040(%%rbx)		\n\t"\
		"\n\t"\
	/*...Block 3: outputs into __in0 + [c,d,e,f]*istride: */\
		"addq	$%c[__i4],%%rax	\n\t"/* __in0 + c*istride */\
		"addq	$%c[__i4],%%rcx	\n\t"/* __in0 + d*istride */\
		"addq	$%c[__i4],%%rbx	\n\t"/* __in0 + e*istride */\
		"addq	$%c[__i4],%%rdx	\n\t"/* __in0 + f*istride */\
		"addq	$0x040,%%rsi 		/* cc += 8 */\n\t"\
		"vbroadcastsd 0x018(%%rsi),%%zmm12 	\n\t"/* load __r3 into zmm12 */\
		"vbroadcastsd 0x028(%%rsi),%%zmm13 	\n\t"/* load __rB into zmm13 */\
		"vbroadcastsd 0x038(%%rsi),%%zmm14 	\n\t"/* load __r7 into zmm14 */\
		"vbroadcastsd 0x048(%%rsi),%%zmm15 	\n\t"/* load __rF into zmm15 */\
		"vmovaps (%%rax),%%zmm0  \n\t vmovaps 0x040(%%rax),%%zmm1  \n\t"/* t24 =__A3r;	t25 =__A3i; */\
		"vmovaps (%%rcx),%%zmm4  \n\t vmovaps 0x040(%%rcx),%%zmm5  \n\t"/* t28 =__ABr;	t29 =__ABi; */\
		"vmovaps (%%rbx),%%zmm8  \n\t vmovaps 0x040(%%rbx),%%zmm9  \n\t"/* _a  =__A7r;	_b  =__A7i; */\
		"vmovaps (%%rdx),%%zmm6  \n\t vmovaps 0x040(%%rdx),%%zmm7  \n\t"/* t30 =__AFr;	t31 =__AFi; */\
		"vfnmadd231pd	0x040(%%rax),%%zmm12,%%zmm0 \n\t	 vfmadd231pd	(%%rax),%%zmm12,%%zmm1  	\n\t"/* FNMA231(__A3i,__r3,t24);	 FMA231(__A3r,__r3,t25); */\
		"vfnmadd231pd	0x040(%%rcx),%%zmm13,%%zmm4 \n\t	 vfmadd231pd	(%%rcx),%%zmm13,%%zmm5  	\n\t"/* FNMA231(__ABi,__rB,t28 );	 FMA231(__ABr,__rB,t29 ); */\
		"vbroadcastsd	0x040(%%rsi),%%zmm13		\n\t"/* load __cF7 */\
		"vfnmadd231pd	0x040(%%rbx),%%zmm14,%%zmm8 \n\t	 vfmadd231pd	(%%rbx),%%zmm14,%%zmm9  	\n\t"/* FNMA231(__A7i,__r7,_a);		 FMA231(__A7r,__r7,_b); */\
		"vbroadcastsd	0x020(%%rsi),%%zmm14		\n\t"/* load __cB3 */\
		"vfnmadd231pd	0x040(%%rdx),%%zmm15,%%zmm6 \n\t	 vfmadd231pd	(%%rdx),%%zmm15,%%zmm7  	\n\t"/* FNMA231(__AFi,__rF,t30 );	 FMA231(__AFr,__rF,t31 ); */\
		"vbroadcastsd	0x030(%%rsi),%%zmm15		\n\t"/* load __c73 */\
		"vmovaps		%%zmm8 ,%%zmm10				\n\t	vmovaps			%%zmm0,%%zmm2 				\n\t"/*	_c= _a;	t26= t24; */\
		" vfmadd231pd	%%zmm6 ,%%zmm13,%%zmm8 		\n\t	 vfmadd231pd	%%zmm4 ,%%zmm14,%%zmm0 		\n\t"/*	 FMA231(t30,__cF7,_a);		 FMA231(t28,__cB3,t24); */\
		"vmovaps		%%zmm9 ,%%zmm11				\n\t	vmovaps			%%zmm1 ,%%zmm3 				\n\t"/*	_d= _b;	t27= t25; */\
		" vfmadd231pd	%%zmm7 ,%%zmm13,%%zmm9 		\n\t	 vfmadd231pd	%%zmm5 ,%%zmm14,%%zmm1 		\n\t"/*	 FMA231(t31,__cF7,_b);		 FMA231(t29,__cB3,t25); */\
		"vfnmadd231pd	%%zmm6 ,%%zmm13,%%zmm10		\n\t	vfnmadd231pd	%%zmm4 ,%%zmm14,%%zmm2 		\n\t"/*	FNMA231(t30,__cF7,_c);		FNMA231(t28,__cB3,t26); */\
		"vfnmadd231pd	%%zmm7 ,%%zmm13,%%zmm11		\n\t	vfnmadd231pd	%%zmm5 ,%%zmm14,%%zmm3 		\n\t"/*	FNMA231(t31,__cF7,_d);		FNMA231(t29,__cB3,t27); */\
		"vmovaps		%%zmm0 ,%%zmm4 				\n\t	vmovaps			%%zmm1 ,%%zmm5 				\n\t"/*	t28 =t24;	t29 =t25; */\
		"vfnmadd231pd	%%zmm8 ,%%zmm15,%%zmm4 		\n\t	 vfmadd231pd	%%zmm8 ,%%zmm15,%%zmm0 		\n\t"/*	FNMA231(_a,__c73,t28);		 FMA231(_a,__c73,t24); */\
		"vfnmadd231pd	%%zmm9 ,%%zmm15,%%zmm5 		\n\t	 vfmadd231pd	%%zmm9 ,%%zmm15,%%zmm1 		\n\t"/*	FNMA231(_b,__c73,t29);		 FMA231(_b,__c73,t25); */\
		"vmovaps		%%zmm2 ,%%zmm6 				\n\t	vmovaps			%%zmm3 ,%%zmm7 				\n\t"/*	t30 =t26;	t31 =t27; */\
		" vfmadd231pd	%%zmm11,%%zmm15,%%zmm6 		\n\t	vfnmadd231pd	%%zmm11,%%zmm15,%%zmm2 		\n\t"/*	 FMA231(_d,__c73,t30);		FNMA231(_d,__c73,t26); */\
		"vfnmadd231pd	%%zmm10,%%zmm15,%%zmm7 		\n\t	 vfmadd231pd	%%zmm10,%%zmm15,%%zmm3 		\n\t"/*	FNMA231(_c,__c73,t31);		 FMA231(_c,__c73,t27); */\
		"vmovaps		%%zmm4 ,     (%%rcx)		\n\t	vmovaps			%%zmm0 ,     (%%rax)		\n\t"/* Write outputs into local store */\
		"vmovaps		%%zmm5 ,0x040(%%rcx)		\n\t	vmovaps			%%zmm1 ,0x040(%%rax)		\n\t"\
		"vmovaps		%%zmm6 ,     (%%rdx)		\n\t	vmovaps			%%zmm2 ,     (%%rbx)		\n\t"\
		"vmovaps		%%zmm7 ,0x040(%%rdx)		\n\t	vmovaps			%%zmm3 ,0x040(%%rbx)		\n\t"\
		"\n\t"\
	/*************************************************************************************/\
	/*  And now do four more radix-4 transforms, including the internal twiddle factors: */\
	/*************************************************************************************/\
	/* Block 0: Combine 0-output of each radix-4, i.e. inputs from __in0 + [0,4,8,c]*istride: */\
		"movq	%[__in0],%%rax		\n\t"\
		"leaq	%c[__i4](%%rax),%%rbx	\n\t"/* __in0 +   [4*istride] */\
		"leaq	%c[__i4](%%rbx),%%rcx	\n\t"/* __in0 + 2*[4*istride] */\
		"leaq	%c[__i4](%%rcx),%%rdx	\n\t"/* __in0 + 3*[4*istride] */\
		"subq	$0x0c0,%%rsi 		/* revert cc-ptr to base value */\n\t"\
		"\n\t"\
		/*...Read t0,8,16,24 from local store ... Do the 4 Im-part FMAs first, because their results needed 1st below */\
		"vbroadcastsd	0x050(%%rsi),%%zmm14		\n\t	vbroadcastsd	0x0d0(%%rsi),%%zmm15		\n\t"/* load __c2,c31 into pair of regs */\
		"vmovaps		     (%%rax),%%zmm0 		\n\t	vmovaps			     (%%rbx),%%zmm2 		\n\t"/*    t00;    t08; */\
		"vmovaps		     (%%rcx),%%zmm4 		\n\t	vmovaps			     (%%rdx),%%zmm6 		\n\t"/*    t16;    t24; */\
		"vmovaps			 %%zmm0 ,%%zmm8 		\n\t	vmovaps				 %%zmm4 ,%%zmm10		\n\t"/* _a=t00; _c=t16; */\
		" vfmadd231pd	%%zmm2 ,%%zmm14,%%zmm0 		\n\t	 vfmadd231pd	%%zmm6 ,%%zmm15,%%zmm4 		\n\t"/*	 FMA231(t08,__c2 ,t00);		 FMA231(t24,__c31,t16); */\
		"vmovaps		0x040(%%rax),%%zmm1 		\n\t	vmovaps			0x040(%%rbx),%%zmm3 		\n\t"/*    t01;    t09; */\
		"vmovaps		0x040(%%rcx),%%zmm5 		\n\t	vmovaps			0x040(%%rdx),%%zmm7 		\n\t"/*    t17;    t25; */\
		"vmovaps			 %%zmm1 ,%%zmm9 		\n\t	vmovaps				 %%zmm5 ,%%zmm11		\n\t"/* _b=t01; _d=t17; */\
		" vfmadd231pd	%%zmm3 ,%%zmm14,%%zmm1 		\n\t	 vfmadd231pd	%%zmm7 ,%%zmm15,%%zmm5 		\n\t"/*	 FMA231(t09,__c2 ,t01);		 FMA231(t25,__c31,t17); */\
		"vfnmadd231pd	%%zmm2 ,%%zmm14,%%zmm8 		\n\t	vfnmadd231pd	%%zmm6 ,%%zmm15,%%zmm10		\n\t"/*	FNMA231(t08,__c2 ,_a );		FNMA231(t24,__c31,_c ); */\
		"vbroadcastsd	0x090(%%rsi),%%zmm6 		\n\t"/* load __c1 */\
		"vfnmadd231pd	%%zmm3 ,%%zmm14,%%zmm9 		\n\t	vfnmadd231pd	%%zmm7 ,%%zmm15,%%zmm11		\n\t"/*	FNMA231(t09,__c2 ,_b );		FNMA231(t25,__c31,_d ); */\
		"vmovaps			 %%zmm0 ,%%zmm12		\n\t	vmovaps				 %%zmm1 ,%%zmm13		\n\t"/* _e = t00; _f = t01; */\
		" vfmadd231pd	%%zmm4 ,%%zmm6 ,%%zmm0 		\n\t	 vfmadd231pd	%%zmm5 ,%%zmm6 ,%%zmm1 		\n\t"/*	 FMA231(t16,__c1 ,t00);		 FMA231(t17,__c1 ,t01); */\
		"vfnmadd231pd	%%zmm4 ,%%zmm6 ,%%zmm12		\n\t	vfnmadd231pd	%%zmm5 ,%%zmm6 ,%%zmm13		\n\t"/*	FNMA231(t16,__c1 ,_e );		FNMA231(t17,__c1 ,_f ); */\
		"vmovaps			 %%zmm8 ,%%zmm2 		\n\t	vmovaps				 %%zmm9 ,%%zmm3 		\n\t"/* t08 = _a ; t09 = _b; */\
		"vfnmadd231pd	%%zmm11,%%zmm6 ,%%zmm2 		\n\t	 vfmadd231pd	%%zmm10,%%zmm6 ,%%zmm3 		\n\t"/*	FNMA231(_d ,__c1 ,t08);		 FMA231(_c ,__c1 ,t09); */\
		" vfmadd231pd	%%zmm11,%%zmm6 ,%%zmm8 		\n\t	vfnmadd231pd	%%zmm10,%%zmm6 ,%%zmm9 		\n\t"/*	 FMA231(_d ,__c1 ,_a );		FNMA231(_c ,__c1 ,_b ); */\
		/* Write outputs: */\
		"movq	%[__out0],%%r10		\n\t"\
		"movq	%[__out1],%%r11		\n\t"\
		"movq	%[__out2],%%r12		\n\t"\
		"movq	%[__out3],%%r13		\n\t"\
		"vmovaps		%%zmm0 ,     (%%r10)		\n\t	vmovaps			%%zmm1 ,0x040(%%r10)		\n\t"/* __B0r= t00;		__B0i= t01; */\
		"vmovaps		%%zmm12,     (%%r11)		\n\t	vmovaps			%%zmm13,0x040(%%r11)		\n\t"/* __B1r= _e ;		__B1i= _f ; */\
		"vmovaps		%%zmm2 ,     (%%r12)		\n\t	vmovaps			%%zmm3 ,0x040(%%r12)		\n\t"/* __B2r= t08;		__B2i= t09; */\
		"vmovaps		%%zmm8 ,     (%%r13)		\n\t	vmovaps			%%zmm9 ,0x040(%%r13)		\n\t"/* __B3r= _a ;		__B3i= _b ; */\
		"\n\t"\
		/*...Block 2: t4,12,20,28 */\
		"vbroadcastsd	0x110(%%rsi),%%zmm13	\n\t"/* cc0 + 0x22 = __two; Actually holds 1.0 in AVX2 mode */\
		"addq	$%c[__i1],%%rax	\n\t"/* __in0 + 1*istride */\
		"addq	$%c[__i1],%%rbx	\n\t"/* __in0 + 5*istride */\
		"addq	$%c[__i1],%%rcx	\n\t"/* __in0 + 9*istride */\
		"addq	$%c[__i1],%%rdx	\n\t"/* __in0 + d*istride */\
		"vbroadcastsd	0x050(%%rsi),%%zmm14		\n\t	vbroadcastsd	0x0d0(%%rsi),%%zmm15		\n\t"/* load __c2,31 into pair of regs */\
		"vmovaps		     (%%rcx),%%zmm4 		\n\t	vmovaps			0x040(%%rcx),%%zmm5 		\n\t"/*    t20;    t21; */\
		"vmovaps		     (%%rdx),%%zmm6 		\n\t	vmovaps			0x040(%%rdx),%%zmm7 		\n\t"/*    t28;    t29; */\
		"vmovaps			 %%zmm4 ,%%zmm10 		\n\t	vmovaps				 %%zmm7 ,%%zmm11		\n\t"/* _c=t20; _d=t29; */\
		"vfnmadd231pd	%%zmm5 ,%%zmm13,%%zmm10		\n\t	 vfmadd231pd	%%zmm5 ,%%zmm13,%%zmm4 		\n\t"/*	FNMA231(t21,1.0,_c );		 FMA231(t21,1.0,t20); */\
		" vfmadd231pd	%%zmm6 ,%%zmm13,%%zmm11		\n\t	vfnmadd231pd	%%zmm6 ,%%zmm13,%%zmm7 		\n\t"/*	 FMA231(t28,1.0,_d );		FNMA231(t28,1.0,t29); */\
		"vbroadcastsd	0x010(%%rsi),%%zmm13		\n\t"/* load __c1i2 */\
		"vmovaps		     (%%rax),%%zmm0 		\n\t	vmovaps			0x040(%%rax),%%zmm1 		\n\t"/*    t04;    t05; */\
		"vmovaps		     (%%rbx),%%zmm2 		\n\t	vmovaps			0x040(%%rbx),%%zmm3 		\n\t"/*    t12;    t13; */\
		"vmovaps			 %%zmm0 ,%%zmm8 		\n\t	vmovaps				 %%zmm1 ,%%zmm9 		\n\t"/* _a = t04; _b = t05; */\
		"vmovaps			 %%zmm10,%%zmm5 		\n\t	vmovaps				 %%zmm4 ,%%zmm12		\n\t"/* t21 = _c; _e = t20; */\
		" vfmadd231pd	%%zmm3 ,%%zmm14,%%zmm8 		\n\t	 vfmadd231pd	%%zmm11,%%zmm15,%%zmm5 		\n\t"/*	 FMA231(t13,__c2 ,_a );		 FMA231(_d ,__c31,t21); */\
		"vfnmadd231pd	%%zmm2 ,%%zmm14,%%zmm9 		\n\t	 vfmadd231pd	%%zmm7 ,%%zmm15,%%zmm4 		\n\t"/*	FNMA231(t12,__c2 ,_b );		 FMA231(t29,__c31,t20); */\
		"vfnmadd231pd	%%zmm3 ,%%zmm14,%%zmm0 		\n\t	vfnmadd231pd	%%zmm11,%%zmm15,%%zmm10		\n\t"/*	FNMA231(t13,__c2 ,t04);		FNMA231(_d ,__c31,_c ); */\
		" vfmadd231pd	%%zmm2 ,%%zmm14,%%zmm1 		\n\t	vfnmadd231pd	%%zmm7 ,%%zmm15,%%zmm12		\n\t"/*	 FMA231(t12,__c2 ,t05);		FNMA231(t29,__c31,_e ); */\
		"vmovaps			 %%zmm8 ,%%zmm2 		\n\t	vmovaps				 %%zmm9 ,%%zmm3 		\n\t"/* t12 = _a; t13 = _b; */\
		"vfnmadd231pd	%%zmm4 ,%%zmm13,%%zmm2 		\n\t	 vfmadd231pd	%%zmm5 ,%%zmm13,%%zmm3 		\n\t"/*	FNMA231(t20,__c1i2,t12);	 FMA231(t21,__c1i2,t13); */\
		" vfmadd231pd	%%zmm4 ,%%zmm13,%%zmm8 		\n\t	vfnmadd231pd	%%zmm5 ,%%zmm13,%%zmm9 		\n\t"/*	 FMA231(t20,__c1i2,_a );	FNMA231(t21,__c1i2,_b ); */\
		"vmovaps			 %%zmm0 ,%%zmm11		\n\t	vmovaps				 %%zmm1 ,%%zmm7 		\n\t"/* _d = t04; t29 = t05; */\
		" vfmadd231pd	%%zmm10,%%zmm13,%%zmm0 		\n\t	 vfmadd231pd	%%zmm12,%%zmm13,%%zmm1 		\n\t"/*	 FMA231(_c ,__c1i2,t04);	 FMA231(_e ,__c1i2,t05); */\
		"vfnmadd231pd	%%zmm10,%%zmm13,%%zmm11		\n\t	vfnmadd231pd	%%zmm12,%%zmm13,%%zmm7 		\n\t"/*	FNMA231(_c ,__c1i2,_d );	FNMA231(_e ,__c1i2,t29); */\
		/* Write outputs: */\
		"movq	%[__out4],%%r10		\n\t"\
		"movq	%[__out5],%%r11		\n\t"\
		"movq	%[__out6],%%r12		\n\t"\
		"movq	%[__out7],%%r13		\n\t"\
		"vmovaps		%%zmm2 ,     (%%r12)		\n\t	vmovaps			%%zmm3 ,0x040(%%r12)		\n\t"/* __B6r= t12;		__B6i= t13; */\
		"vmovaps		%%zmm8 ,     (%%r13)		\n\t	vmovaps			%%zmm9 ,0x040(%%r13)		\n\t"/* __B7r= _a ;		__B7i= _b ; */\
		"vmovaps		%%zmm0 ,     (%%r10)		\n\t	vmovaps			%%zmm1 ,0x040(%%r10)		\n\t"/* __B4r= t04;		__B4i= t05; */\
		"vmovaps		%%zmm11,     (%%r11)		\n\t	vmovaps			%%zmm7 ,0x040(%%r11)		\n\t"/* __B5r= _d ;		__B5i= t29; */\
		"\n\t"\
		/*...Block 1: t2,10,18,26 */\
		"addq	$%c[__i1],%%rax	\n\t"/* __in0 + 2*istride */\
		"addq	$%c[__i1],%%rbx	\n\t"/* __in0 + 6*istride */\
		"addq	$%c[__i1],%%rcx	\n\t"/* __in0 + a*istride */\
		"addq	$%c[__i1],%%rdx	\n\t"/* __in0 + e*istride */\
		"vmovaps		     (%%rbx),%%zmm2 		\n\t	vmovaps			0x040(%%rbx),%%zmm3 		\n\t"/*    t10;    t11; */\
		"vbroadcastsd	0x008(%%rsi),%%zmm15		\n\t"/* load __sc  */\
		"vbroadcastsd	0x0d0(%%rsi),%%zmm14		\n\t"/* load __c31 */\
		"vmovaps		     (%%rcx),%%zmm4 		\n\t	vmovaps			0x040(%%rcx),%%zmm5 		\n\t"/*    t18;    t19; */\
		"vmovaps		     (%%rdx),%%zmm6 		\n\t	vmovaps			0x040(%%rdx),%%zmm7 		\n\t"/*    t26;    t27; */\
		"vsubpd		%%zmm3 ,%%zmm2 ,%%zmm12 		\n\t"/* _e = t10-t11; */\
		"vaddpd		%%zmm2 ,%%zmm3 ,%%zmm13			\n\t"/* _f = t10+t11; */\
		"vmovaps			 %%zmm4 ,%%zmm10 		\n\t	vmovaps				 %%zmm7 ,%%zmm8 		\n\t"/* _c = t18; _a = t27; */\
		"vfnmadd231pd	%%zmm5 ,%%zmm15,%%zmm10		\n\t	 vfmsub231pd	%%zmm6 ,%%zmm15,%%zmm8 		\n\t"/*	FNMA231(t19,__sc,_c );		 FMS231(t26,__sc,_a ); */\
		"vmovaps			 %%zmm5 ,%%zmm11 		\n\t	vmovaps				 %%zmm6 ,%%zmm9 		\n\t"/* _d = t19; _b = t26; */\
		"vbroadcastsd	0x018(%%rsi),%%zmm6 		\n\t"/* load __c2i2 */\
		" vfmadd231pd	%%zmm4 ,%%zmm15,%%zmm11		\n\t	 vfmadd231pd	%%zmm7 ,%%zmm15,%%zmm9 		\n\t"/*	 FMA231(t18,__sc,_d );		 FMA231(t27,__sc,_b ); */\
		"vbroadcastsd	(%%rsi),%%zmm15				\n\t"/* load __c1_c */\
		"vmovaps		     (%%rax),%%zmm0 		\n\t	vmovaps			0x040(%%rax),%%zmm1 		\n\t"/*    t02;    t03; */\
		"vmovaps			 %%zmm10,%%zmm4 		\n\t	vmovaps				 %%zmm0 ,%%zmm2 		\n\t"/* t18 = _c;	t10 = t02; */\
		" vfmadd231pd	%%zmm8 ,%%zmm14,%%zmm4 		\n\t	 vfmadd231pd	%%zmm12,%%zmm6 ,%%zmm0 		\n\t"/*	 FMA231(_a ,__c31,t18);		 FMA231(_e ,__c2i2,t02); */\
		"vmovaps			 %%zmm11,%%zmm5 		\n\t	vmovaps				 %%zmm1 ,%%zmm3 		\n\t"/* t19 = _d;	t11 = t03; */\
		" vfmadd231pd	%%zmm9 ,%%zmm14,%%zmm5 		\n\t	 vfmadd231pd	%%zmm13,%%zmm6 ,%%zmm1 		\n\t"/*	 FMA231(_b ,__c31,t19);		 FMA231(_f ,__c2i2,t03); */\
		"vfnmadd231pd	%%zmm8 ,%%zmm14,%%zmm10		\n\t	vfnmadd231pd	%%zmm12,%%zmm6 ,%%zmm2 		\n\t"/*	FNMA231(_a ,__c31,_c );		FNMA231(_e ,__c2i2,t10); */\
		"vfnmadd231pd	%%zmm9 ,%%zmm14,%%zmm11		\n\t	vfnmadd231pd	%%zmm13,%%zmm6 ,%%zmm3 		\n\t"/*	FNMA231(_b ,__c31,_d );		FNMA231(_f ,__c2i2,t11); */\
		"vmovaps			 %%zmm0 ,%%zmm8 		\n\t	vmovaps				 %%zmm1 ,%%zmm9 		\n\t"/* _a = t02; _b = t03; */\
		" vfmadd231pd	%%zmm4 ,%%zmm15,%%zmm0 		\n\t	 vfmadd231pd	%%zmm5 ,%%zmm15,%%zmm1 		\n\t"/*	 FMA231(t18,__c1_c,t02);	 FMA231(t19,__c1_c,t03); */\
		"vfnmadd231pd	%%zmm4 ,%%zmm15,%%zmm8 		\n\t	vfnmadd231pd	%%zmm5 ,%%zmm15,%%zmm9 		\n\t"/*	FNMA231(t18,__c1_c,_a );	FNMA231(t19,__c1_c,_b ); */\
		"vmovaps			 %%zmm2 ,%%zmm12		\n\t	vmovaps				 %%zmm3 ,%%zmm13		\n\t"/* _e = t10; _f = t11; */\
		"vfnmadd231pd	%%zmm11,%%zmm15,%%zmm2 		\n\t	 vfmadd231pd	%%zmm10,%%zmm15,%%zmm3 		\n\t"/*	FNMA231(_d ,__c1_c,t10);	 FMA231(_c ,__c1_c,t11); */\
		" vfmadd231pd	%%zmm11,%%zmm15,%%zmm12		\n\t	vfnmadd231pd	%%zmm10,%%zmm15,%%zmm13		\n\t"/*	 FMA231(_d ,__c1_c,_e );	FNMA231(_c ,__c1_c,_f ); */\
		/* Write outputs: */\
		"movq	%[__out8],%%r10		\n\t"\
		"movq	%[__out9],%%r11		\n\t"\
		"movq	%[__outa],%%r12		\n\t"\
		"movq	%[__outb],%%r13		\n\t"\
		"vmovaps		%%zmm0 ,     (%%r10)		\n\t	vmovaps			%%zmm1 ,0x040(%%r10)		\n\t"/* __B8r= t02;		__B8i= t03; */\
		"vmovaps		%%zmm8 ,     (%%r11)		\n\t	vmovaps			%%zmm9 ,0x040(%%r11)		\n\t"/* __B9r= _a ;		__B9i= _b ; */\
		"vmovaps		%%zmm2 ,     (%%r12)		\n\t	vmovaps			%%zmm3 ,0x040(%%r12)		\n\t"/* __BAr= t10;		__BAi= t11; */\
		"vmovaps		%%zmm12,     (%%r13)		\n\t	vmovaps			%%zmm13,0x040(%%r13)		\n\t"/* __BBr= _e ;		__BBi= _f ; */\
		"\n\t"\
		/*...Block 3: t6,14,22,30 */\
		"addq	$%c[__i1],%%rax	\n\t"/* __in0 + 3*istride */\
		"addq	$%c[__i1],%%rbx	\n\t"/* __in0 + 7*istride */\
		"addq	$%c[__i1],%%rcx	\n\t"/* __in0 + b*istride */\
		"addq	$%c[__i1],%%rdx	\n\t"/* __in0 + f*istride */\
		"vmovaps		     (%%rbx),%%zmm2 		\n\t	vmovaps			0x040(%%rbx),%%zmm3 		\n\t"/*    t14;    t15; */\
		"vbroadcastsd	0x008(%%rsi),%%zmm15		\n\t"/* load __sc  */\
		"vmovaps		     (%%rcx),%%zmm4 		\n\t	vmovaps			0x040(%%rcx),%%zmm5 		\n\t"/*    t22;    t23; */\
		"vmovaps		     (%%rdx),%%zmm6 		\n\t	vmovaps			0x040(%%rdx),%%zmm7 		\n\t"/*    t30;    t31; */\
		"vaddpd		%%zmm2 ,%%zmm3 ,%%zmm10 		\n\t"/* _c = t14+t15; */\
		"vsubpd		%%zmm2 ,%%zmm3 ,%%zmm11			\n\t"/* _d = t15-t14; */\
		"vbroadcastsd	0x0d0(%%rsi),%%zmm14		\n\t"/* load __c31 */\
		"vmovaps			 %%zmm5 ,%%zmm12 		\n\t	vmovaps				 %%zmm4 ,%%zmm13		\n\t"/* _e = t23; _f = t22;*/\
		" vfmsub231pd	%%zmm4 ,%%zmm15,%%zmm12		\n\t	 vfmadd231pd	%%zmm5 ,%%zmm15,%%zmm13		\n\t"/*	 FMS231(t22,__sc,_e );		 FMA231(t23,__sc,_f );*/\
		"vmovaps			 %%zmm6 ,%%zmm8  		\n\t	vmovaps				 %%zmm7 ,%%zmm9 		\n\t"/* _a = t30; _b = t31; */\
		"vfnmadd231pd	%%zmm7 ,%%zmm15,%%zmm8 		\n\t	 vfmadd231pd	%%zmm6 ,%%zmm15,%%zmm9 		\n\t"/*	FNMA231(t31,__sc,_a );		 FMA231(t30,__sc,_b );*/\
		"vbroadcastsd	0x018(%%rsi),%%zmm6 		\n\t"/* load __c2i2 */\
		"vbroadcastsd	(%%rsi),%%zmm15				\n\t"/* load __c1_c */\
		"vmovaps		     (%%rax),%%zmm0 		\n\t	vmovaps			0x040(%%rax),%%zmm1 		\n\t"/*    t06;    t07; */\
		"vmovaps			 %%zmm1 ,%%zmm3 		\n\t	vmovaps				 %%zmm0 ,%%zmm2 		\n\t"/* t15= t07;	t14= t06; */\
		"vfnmadd231pd	%%zmm11,%%zmm6 ,%%zmm1 		\n\t	vfnmadd231pd	%%zmm10,%%zmm6 ,%%zmm0 		\n\t"/*	FNMA231(_d ,__c2i2,t07);	FNMA231(_c ,__c2i2,t06); */\
		"vmovaps			 %%zmm12,%%zmm4 		\n\t	vmovaps				 %%zmm13,%%zmm5 		\n\t"/* t22= _e; t23= _f; */\
		"vfnmadd231pd	%%zmm8 ,%%zmm14,%%zmm4 		\n\t	vfnmadd231pd	%%zmm9 ,%%zmm14,%%zmm5 		\n\t"/*	FNMA231(_a ,__c31 ,t22);	FNMA231(_b ,__c31 ,t23); */\
		" vfmadd231pd	%%zmm8 ,%%zmm14,%%zmm12		\n\t	 vfmadd231pd	%%zmm9 ,%%zmm14,%%zmm13		\n\t"/*	 FMA231(_a ,__c31 ,_e );	 FMA231(_b ,__c31 ,_f ); */\
		" vfmadd231pd	%%zmm10,%%zmm6 ,%%zmm2 		\n\t	 vfmadd231pd	%%zmm11,%%zmm6 ,%%zmm3 		\n\t"/*	 FMA231(_c ,__c2i2,t14);	 FMA231(_d ,__c2i2,t15); */\
		"vmovaps			 %%zmm0 ,%%zmm8 		\n\t	vmovaps				 %%zmm1 ,%%zmm9 		\n\t"/* _a = t06; _b = t07; */\
		" vfmadd231pd	%%zmm4 ,%%zmm15,%%zmm0 		\n\t	 vfmadd231pd	%%zmm5 ,%%zmm15,%%zmm1 		\n\t"/*	 FMA231(t22,__c1_c,t06);	 FMA231(t23,__c1_c,t07); */\
		"vfnmadd231pd	%%zmm4 ,%%zmm15,%%zmm8 		\n\t	vfnmadd231pd	%%zmm5 ,%%zmm15,%%zmm9 		\n\t"/*	FNMA231(t22,__c1_c,_a );	FNMA231(t23,__c1_c,_b ); */\
		"vmovaps			 %%zmm2 ,%%zmm10		\n\t	vmovaps				 %%zmm3 ,%%zmm11		\n\t"/* _c = t14; _d = t15; */\
		"vfnmadd231pd	%%zmm13,%%zmm15,%%zmm2 		\n\t	 vfmadd231pd	%%zmm12,%%zmm15,%%zmm3 		\n\t"/*	FNMA231(_f ,__c1_c,t14);	 FMA231(_e ,__c1_c,t15); */\
		" vfmadd231pd	%%zmm13,%%zmm15,%%zmm10		\n\t	vfnmadd231pd	%%zmm12,%%zmm15,%%zmm11		\n\t"/*	 FMA231(_f ,__c1_c,_c );	FNMA231(_e ,__c1_c,_d ); */\
		/* Write outputs: */\
		"movq	%[__outc],%%r10		\n\t"\
		"movq	%[__outd],%%r11		\n\t"\
		"movq	%[__oute],%%r12		\n\t"\
		"movq	%[__outf],%%r13		\n\t"\
		"vmovaps		%%zmm0 ,     (%%r10)		\n\t	vmovaps			%%zmm1 ,0x040(%%r10)		\n\t"/* __BCr= t06;		__BCi= t07; */\
		"vmovaps		%%zmm8 ,     (%%r11)		\n\t	vmovaps			%%zmm9 ,0x040(%%r11)		\n\t"/* __BDr= _a ;		__BDi= _b ; */\
		"vmovaps		%%zmm2 ,     (%%r12)		\n\t	vmovaps			%%zmm3 ,0x040(%%r12)		\n\t"/* __BEr= t14;		__BEi= t15; */\
		"vmovaps		%%zmm10,     (%%r13)		\n\t	vmovaps			%%zmm11,0x040(%%r13)		\n\t"/* __BFr= _c ;		__BFi= _d ; */\
		:					/* outputs: none */\
		: [__in0] "m" (Xin0)	/* All inputs from memory addresses here */\
		 ,[__i1] "e" (Xi1)\
		 ,[__i2] "e" (Xi2)\
		 ,[__i3] "e" (Xi3)\
		 ,[__i4] "e" (Xi4)\
		 ,[__out0] "m" (Xout0)\
		 ,[__out1] "m" (Xout1)\
		 ,[__out2] "m" (Xout2)\
		 ,[__out3] "m" (Xout3)\
		 ,[__out4] "m" (Xout4)\
		 ,[__out5] "m" (Xout5)\
		 ,[__out6] "m" (Xout6)\
		 ,[__out7] "m" (Xout7)\
		 ,[__out8] "m" (Xout8)\
		 ,[__out9] "m" (Xout9)\
		 ,[__outa] "m" (Xouta)\
		 ,[__outb] "m" (Xoutb)\
		 ,[__outc] "m" (Xoutc)\
		 ,[__outd] "m" (Xoutd)\
		 ,[__oute] "m" (Xoute)\
		 ,[__outf] "m" (Xoutf)\
		 ,[__cc0] "m" (Xcc0)\
		: "cc","memory","rax","rbx","rcx","rdx","rsi","r10","r11","r12","r13","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15"	/* Clobbered registers */\
	);\
	}

	#define SSE2_RADIX16_DIT_FMA_OOP(Xin0,Xi1,Xi2,Xi3,Xi4, Xout0,Xo1,Xo2,Xo3,Xo4, Xcc0)\
	{\
	__asm__ volatile (\
	/*...Block 0: Do in-place, i.e. outputs into __in0 + [0,1,2,3]*istride: */\
		"movq	%[__in0],%%rax		\n\t"\
		"leaq	%c[__i1](%%rax),%%rcx	\n\t"/* __in0 +   istride */\
		"leaq	%c[__i2](%%rax),%%rbx	\n\t"/* __in0 + 2*istride */\
		"leaq	%c[__i3](%%rax),%%rdx	\n\t"/* __in0 + 3*istride */\
		"movq	%[__cc0],%%rsi 			\n\t"\
		"vbroadcastsd 0x28(%%rsi),%%zmm13 \n\t vbroadcastsd 0x38(%%rsi),%%zmm14 \n\t vbroadcastsd 0x48(%%rsi),%%zmm15 \n\t"/* load __r8,r4,rC into zmm13-15 */\
		"vmovaps		     (%%rcx),%%zmm4 		\n\t	vmovaps			0x040(%%rcx),%%zmm5 		\n\t"/*	t04 =__A8r;					t05 =__A8i; */\
		"vmovaps		     (%%rax),%%zmm0 		\n\t	vmovaps			0x040(%%rax),%%zmm1 		\n\t"/*	t00 =__A0r;					t01 =__A0i; */\
		"vmovaps		%%zmm4,%%zmm6				\n\t"/*	t06 = t04; */\
		"vfmadd231pd	%%zmm5 ,%%zmm13,%%zmm4 		\n\t	vfnmadd231pd	%%zmm6 ,%%zmm13,%%zmm5 		\n\t"/*	FNMA231(  t05,__r8,t04);	 FMA231(  t06,__r8,t05); */\
		"vmovaps		     (%%rbx),%%zmm8			\n\t	vmovaps			0x040(%%rbx),%%zmm9 		\n\t"/*	_a =__A4r;					_b =__A4i; */\
		"vfmadd231pd	0x040(%%rbx),%%zmm14,%%zmm8 \n\t	vfnmadd231pd	     (%%rbx),%%zmm14,%%zmm9 \n\t"/*	FNMA231(__A4i,__r4,_a );	 FMA231(__A4r,__r4,_b ); */\
		"vbroadcastsd	0x040(%%rsi),%%zmm13		\n\t	vbroadcastsd	0x020(%%rsi),%%zmm14		\n\t"/* load __cC4,c8 into pair of regs */\
		"vmovaps		     (%%rdx),%%zmm6			\n\t	vmovaps			0x040(%%rdx),%%zmm7 		\n\t"/*	t06 =__ACr;					t07 =__ACi; */\
		"vfmadd231pd	0x040(%%rdx),%%zmm15,%%zmm6 \n\t	vfnmadd231pd	     (%%rdx),%%zmm15,%%zmm7 \n\t"/*	FNMA231(__ACi,__rC,t06);	 FMA231(__ACr,__rC,t07); */\
		"vbroadcastsd	0x030(%%rsi),%%zmm15		\n\t"/* load __c4 */\
		"vmovaps		%%zmm8 ,%%zmm10				\n\t	vmovaps			%%zmm0,%%zmm2 				\n\t"/*	_c = _a;	t02 = t00; */\
		" vfmadd231pd	%%zmm6 ,%%zmm13,%%zmm8 		\n\t	 vfmadd231pd	%%zmm4 ,%%zmm14,%%zmm0 		\n\t"/*	 FMA231(t06,__cC4,_a);		 FMA231(t04,__c8,t00); */\
		"vmovaps		%%zmm9 ,%%zmm11				\n\t	vmovaps			%%zmm1 ,%%zmm3 				\n\t"/*	_d = _b;	t03 = t01; */\
		" vfmadd231pd	%%zmm7 ,%%zmm13,%%zmm9 		\n\t	 vfmadd231pd	%%zmm5 ,%%zmm14,%%zmm1 		\n\t"/*	 FMA231(t07,__cC4,_b);		 FMA231(t05,__c8,t01); */\
		"vfnmadd231pd	%%zmm6 ,%%zmm13,%%zmm10		\n\t	vfnmadd231pd	%%zmm4 ,%%zmm14,%%zmm2 		\n\t"/*	FNMA231(t06,__cC4,_c);		FNMA231(t04,__c8,t02); */\
		"vfnmadd231pd	%%zmm7 ,%%zmm13,%%zmm11		\n\t	vfnmadd231pd	%%zmm5 ,%%zmm14,%%zmm3 		\n\t"/*	FNMA231(t07,__cC4,_d);		FNMA231(t05,__c8,t03); */\
		"vmovaps		%%zmm0 ,%%zmm4 				\n\t	vmovaps			%%zmm1 ,%%zmm5 				\n\t"/*	t04 =t00; t05 =t01; */\
		"vfnmadd231pd	%%zmm8 ,%%zmm15,%%zmm4 		\n\t	 vfmadd231pd	%%zmm8 ,%%zmm15,%%zmm0 		\n\t"/*	FNMA231(_a ,__c4 ,t04);		 FMA231(_a ,__c4 ,t00); */\
		"vfnmadd231pd	%%zmm9 ,%%zmm15,%%zmm5 		\n\t	 vfmadd231pd	%%zmm9 ,%%zmm15,%%zmm1 		\n\t"/*	FNMA231(_b ,__c4 ,t05);		 FMA231(_b ,__c4 ,t01); */\
		"vmovaps		%%zmm2 ,%%zmm6 				\n\t	vmovaps			%%zmm3 ,%%zmm7 				\n\t"/*	t06 =t02;	t07 =t03; */\
		" vfmadd231pd	%%zmm11,%%zmm15,%%zmm2 		\n\t	vfnmadd231pd	%%zmm11,%%zmm15,%%zmm6 		\n\t"/*	 FMA231(_d ,__c4 ,t06);		FNMA231(_d ,__c4 ,t02); */\
		"vfnmadd231pd	%%zmm10,%%zmm15,%%zmm3 		\n\t	 vfmadd231pd	%%zmm10,%%zmm15,%%zmm7 		\n\t"/*	FNMA231(_c ,__c4 ,t07);		 FMA231(_c ,__c4 ,t03); */\
		"vmovaps		%%zmm4 ,     (%%rcx)		\n\t	vmovaps			%%zmm0 ,     (%%rax)		\n\t"/* Write outputs into local store */\
		"vmovaps		%%zmm5 ,0x040(%%rcx)		\n\t	vmovaps			%%zmm1 ,0x040(%%rax)		\n\t"\
		"vmovaps		%%zmm6 ,     (%%rdx)		\n\t	vmovaps			%%zmm2 ,     (%%rbx)		\n\t"\
		"vmovaps		%%zmm7 ,0x040(%%rdx)		\n\t	vmovaps			%%zmm3 ,0x040(%%rbx)		\n\t"\
		"\n\t"\
	/*...Block 1: outputs into __in0 + [4,5,6,7]*istride: */\
		"addq	$%c[__i4],%%rax	\n\t"/* __in0 + 4*istride */\
		"addq	$%c[__i4],%%rcx	\n\t"/* __in0 + 5*istride */\
		"addq	$%c[__i4],%%rbx	\n\t"/* __in0 + 6*istride */\
		"addq	$%c[__i4],%%rdx	\n\t"/* __in0 + 7*istride */\
		"addq	$0x040,%%rsi 		/* cc += 8 */\n\t"\
		"vbroadcastsd 0x018(%%rsi),%%zmm12 	\n\t"/* load __r2 into zmm12 */\
		"vbroadcastsd 0x028(%%rsi),%%zmm13 	\n\t"/* load __rA into zmm13 */\
		"vbroadcastsd 0x038(%%rsi),%%zmm14 	\n\t"/* load __r6 into zmm14 */\
		"vbroadcastsd 0x048(%%rsi),%%zmm15 	\n\t"/* load __rE into zmm15 */\
		"vmovaps (%%rax),%%zmm0  \n\t vmovaps 0x040(%%rax),%%zmm1  \n\t"/* t08 =__A2r; t09 =__A2i; */\
		"vmovaps (%%rcx),%%zmm4  \n\t vmovaps 0x040(%%rcx),%%zmm5  \n\t"/* t12 =__AAr; t13 =__AAi; */\
		"vmovaps (%%rbx),%%zmm8  \n\t vmovaps 0x040(%%rbx),%%zmm9  \n\t"/* _a  =__A6r; _b  =__A6i; */\
		"vmovaps (%%rdx),%%zmm6  \n\t vmovaps 0x040(%%rdx),%%zmm7  \n\t"/* t14 =__AEr; t15 =__AEi; */\
		"vfmadd231pd	0x040(%%rax),%%zmm12,%%zmm0 \n\t	vfnmadd231pd	(%%rax),%%zmm12,%%zmm1  	\n\t"/* FNMA231(__A2i,__r2,t08); FMA231(__A2r,__r2,t09); */\
		"vfmadd231pd	0x040(%%rcx),%%zmm13,%%zmm4 \n\t	vfnmadd231pd	(%%rcx),%%zmm13,%%zmm5  	\n\t"/* FNMA231(__AAi,__rA,t12); FMA231(__AAr,__rA,t13); */\
		"vbroadcastsd	0x040(%%rsi),%%zmm13		\n\t"/* load __cE6 */\
		"vfmadd231pd	0x040(%%rbx),%%zmm14,%%zmm8 \n\t	vfnmadd231pd	(%%rbx),%%zmm14,%%zmm9  	\n\t"/* FNMA231(__A6i,__r6,_a ); FMA231(__A6r,__r6,_b ); */\
		"vbroadcastsd	0x020(%%rsi),%%zmm14		\n\t"/* load __cA2 */\
		"vfmadd231pd	0x040(%%rdx),%%zmm15,%%zmm6 \n\t	vfnmadd231pd	(%%rdx),%%zmm15,%%zmm7  	\n\t"/* FNMA231(__AEi,__rE,t14); FMA231(__AEr,__rE,t15); */\
		"vbroadcastsd	0x030(%%rsi),%%zmm15		\n\t"/* load __c62 */\
		"vmovaps		%%zmm8 ,%%zmm10				\n\t	vmovaps			%%zmm0,%%zmm2 				\n\t"/*	_c = _a;	t10 = t08; */\
		" vfmadd231pd	%%zmm6 ,%%zmm13,%%zmm8 		\n\t	 vfmadd231pd	%%zmm4 ,%%zmm14,%%zmm0 		\n\t"/*	 FMA231(t14,__cE6,_a);		 FMA231(t12,__cA2,t08); */\
		"vmovaps		%%zmm9 ,%%zmm11				\n\t	vmovaps			%%zmm1 ,%%zmm3 				\n\t"/*	_d = _b;	t11 = t09; */\
		" vfmadd231pd	%%zmm7 ,%%zmm13,%%zmm9 		\n\t	 vfmadd231pd	%%zmm5 ,%%zmm14,%%zmm1 		\n\t"/*	 FMA231(t15,__cE6,_b);		 FMA231(t13,__cA2,t09); */\
		"vfnmadd231pd	%%zmm6 ,%%zmm13,%%zmm10		\n\t	vfnmadd231pd	%%zmm4 ,%%zmm14,%%zmm2 		\n\t"/*	FNMA231(t14,__cE6,_c);		FNMA231(t12,__cA2,t10); */\
		"vfnmadd231pd	%%zmm7 ,%%zmm13,%%zmm11		\n\t	vfnmadd231pd	%%zmm5 ,%%zmm14,%%zmm3 		\n\t"/*	FNMA231(t15,__cE6,_d);		FNMA231(t13,__cA2,t11); */\
		"vmovaps		%%zmm0 ,%%zmm4 				\n\t	vmovaps			%%zmm1 ,%%zmm5 				\n\t"/*	t12 =t08 ;	t13 =t09; */\
		"vfnmadd231pd	%%zmm8 ,%%zmm15,%%zmm4 		\n\t	 vfmadd231pd	%%zmm8 ,%%zmm15,%%zmm0 		\n\t"/*	FNMA231(_a,__c62,t12);		 FMA231( _a,__c62,t08); */\
		"vfnmadd231pd	%%zmm9 ,%%zmm15,%%zmm5 		\n\t	 vfmadd231pd	%%zmm9 ,%%zmm15,%%zmm1 		\n\t"/*	FNMA231(_b,__c62,t13);		 FMA231( _b,__c62,t09); */\
		"vmovaps		%%zmm2 ,%%zmm6 				\n\t	vmovaps			%%zmm3 ,%%zmm7 				\n\t"/*	t14 =t10;	t15 =t11; */\
		" vfmadd231pd	%%zmm11,%%zmm15,%%zmm2 		\n\t	vfnmadd231pd	%%zmm11,%%zmm15,%%zmm6 		\n\t"/*	 FMA231(_d,__c62,t14);		FNMA231( _d,__c62,t10); */\
		"vfnmadd231pd	%%zmm10,%%zmm15,%%zmm3 		\n\t	 vfmadd231pd	%%zmm10,%%zmm15,%%zmm7 		\n\t"/*	FNMA231(_c,__c62,t15);		 FMA231( _c,__c62,t11); */\
		"vmovaps		%%zmm4 ,     (%%rcx)		\n\t	vmovaps			%%zmm0 ,     (%%rax)		\n\t"/* Write outputs into local store */\
		"vmovaps		%%zmm5 ,0x040(%%rcx)		\n\t	vmovaps			%%zmm1 ,0x040(%%rax)		\n\t"\
		"vmovaps		%%zmm6 ,     (%%rdx)		\n\t	vmovaps			%%zmm2 ,     (%%rbx)		\n\t"\
		"vmovaps		%%zmm7 ,0x040(%%rdx)		\n\t	vmovaps			%%zmm3 ,0x040(%%rbx)		\n\t"\
		"\n\t"\
	/*...Block 2: outputs into __in0 + [8,9,a,b]*istride: */\
		"addq	$%c[__i4],%%rax	\n\t"/* __in0 + 8*istride */\
		"addq	$%c[__i4],%%rcx	\n\t"/* __in0 + 9*istride */\
		"addq	$%c[__i4],%%rbx	\n\t"/* __in0 + a*istride */\
		"addq	$%c[__i4],%%rdx	\n\t"/* __in0 + b*istride */\
		"addq	$0x040,%%rsi 		/* cc += 8 */\n\t"\
		"vbroadcastsd 0x018(%%rsi),%%zmm12 	\n\t"/* load __r1 into zmm12 */\
		"vbroadcastsd 0x028(%%rsi),%%zmm13 	\n\t"/* load __r9 into zmm13 */\
		"vbroadcastsd 0x038(%%rsi),%%zmm14 	\n\t"/* load __r5 into zmm14 */\
		"vbroadcastsd 0x048(%%rsi),%%zmm15 	\n\t"/* load __rD into zmm15 */\
		"vmovaps (%%rax),%%zmm0  \n\t vmovaps 0x040(%%rax),%%zmm1  \n\t"/* t16 =__A1r;	t17 =__A1i; */\
		"vmovaps (%%rcx),%%zmm4  \n\t vmovaps 0x040(%%rcx),%%zmm5  \n\t"/* t20 =__A9r;	t21 =__A9i; */\
		"vmovaps (%%rbx),%%zmm8  \n\t vmovaps 0x040(%%rbx),%%zmm9  \n\t"/* _a=  __A5r;	_b  =__A5i; */\
		"vmovaps (%%rdx),%%zmm6  \n\t vmovaps 0x040(%%rdx),%%zmm7  \n\t"/* t22 =__ADr;	t23 =__ADi; */\
		"vfmadd231pd	0x040(%%rax),%%zmm12,%%zmm0 \n\t	vfnmadd231pd	(%%rax),%%zmm12,%%zmm1  	\n\t"/* FNMA231(__A1i,__r1,t16);	 FMA231(__A1r,__r1,t17); */\
		"vfmadd231pd	0x040(%%rcx),%%zmm13,%%zmm4 \n\t	vfnmadd231pd	(%%rcx),%%zmm13,%%zmm5  	\n\t"/* FNMA231(__A9i,__r9,t20);	 FMA231(__A9r,__r9,t21); */\
		"vbroadcastsd	0x040(%%rsi),%%zmm13		\n\t"/* load __cD5 */\
		"vfmadd231pd	0x040(%%rbx),%%zmm14,%%zmm8 \n\t	vfnmadd231pd	(%%rbx),%%zmm14,%%zmm9  	\n\t"/* FNMA231(__A5i,__r5,_a );	 FMA231(__A5r,__r5,_b ); */\
		"vbroadcastsd	0x020(%%rsi),%%zmm14		\n\t"/* load __c91 */\
		"vfmadd231pd	0x040(%%rdx),%%zmm15,%%zmm6 \n\t	vfnmadd231pd	(%%rdx),%%zmm15,%%zmm7  	\n\t"/* FNMA231(__ADi,__rD,t22);	 FMA231(__ADr,__rD,t23); */\
		"vbroadcastsd	0x030(%%rsi),%%zmm15		\n\t"/* load __c51 */\
		"vmovaps		%%zmm8 ,%%zmm10				\n\t	vmovaps			%%zmm0,%%zmm2 				\n\t"/*	_c= _a;	t18= t16; */\
		" vfmadd231pd	%%zmm6 ,%%zmm13,%%zmm8 		\n\t	 vfmadd231pd	%%zmm4 ,%%zmm14,%%zmm0 		\n\t"/*	 FMA231(t22,__cD5,_a);		 FMA231(t20,__c91,t16); */\
		"vmovaps		%%zmm9 ,%%zmm11				\n\t	vmovaps			%%zmm1 ,%%zmm3 				\n\t"/*	_d= _b;	t19= t17; */\
		" vfmadd231pd	%%zmm7 ,%%zmm13,%%zmm9 		\n\t	 vfmadd231pd	%%zmm5 ,%%zmm14,%%zmm1 		\n\t"/*	 FMA231(t23,__cD5,_b);		 FMA231(t21,__c91,t17); */\
		"vfnmadd231pd	%%zmm6 ,%%zmm13,%%zmm10		\n\t	vfnmadd231pd	%%zmm4 ,%%zmm14,%%zmm2 		\n\t"/*	FNMA231(t22,__cD5,_c);		FNMA231(t20,__c91,t18); */\
		"vfnmadd231pd	%%zmm7 ,%%zmm13,%%zmm11		\n\t	vfnmadd231pd	%%zmm5 ,%%zmm14,%%zmm3 		\n\t"/*	FNMA231(t23,__cD5,_d);		FNMA231(t21,__c91,t19); */\
		"vmovaps		%%zmm0 ,%%zmm4 				\n\t	vmovaps			%%zmm1 ,%%zmm5 				\n\t"/*	t20 =t16;	t21 =t17; */\
		"vfnmadd231pd	%%zmm8 ,%%zmm15,%%zmm4 		\n\t	 vfmadd231pd	%%zmm8 ,%%zmm15,%%zmm0 		\n\t"/*	FNMA231(_a,__c51,t20);		 FMA231(_a,__c51,t16); */\
		"vfnmadd231pd	%%zmm9 ,%%zmm15,%%zmm5 		\n\t	 vfmadd231pd	%%zmm9 ,%%zmm15,%%zmm1 		\n\t"/*	FNMA231(_b,__c51,t21);		 FMA231(_b,__c51,t17); */\
		"vmovaps		%%zmm2 ,%%zmm6 				\n\t	vmovaps			%%zmm3 ,%%zmm7 				\n\t"/*	t22 =t18;	t23 =t19; */\
		" vfmadd231pd	%%zmm11,%%zmm15,%%zmm2 		\n\t	vfnmadd231pd	%%zmm11,%%zmm15,%%zmm6 		\n\t"/*	 FMA231(_d,__c51,t22);		FNMA231(_d,__c51,t18); */\
		"vfnmadd231pd	%%zmm10,%%zmm15,%%zmm3 		\n\t	 vfmadd231pd	%%zmm10,%%zmm15,%%zmm7 		\n\t"/*	FNMA231(_c,__c51,t23);		 FMA231(_c,__c51,t19); */\
		"vmovaps		%%zmm4 ,     (%%rcx)		\n\t	vmovaps			%%zmm0 ,     (%%rax)		\n\t"/* Write outputs into local store */\
		"vmovaps		%%zmm5 ,0x040(%%rcx)		\n\t	vmovaps			%%zmm1 ,0x040(%%rax)		\n\t"\
		"vmovaps		%%zmm6 ,     (%%rdx)		\n\t	vmovaps			%%zmm2 ,     (%%rbx)		\n\t"\
		"vmovaps		%%zmm7 ,0x040(%%rdx)		\n\t	vmovaps			%%zmm3 ,0x040(%%rbx)		\n\t"\
		"\n\t"\
	/*...Block 3: outputs into __in0 + [c,d,e,f]*istride: */\
		"addq	$%c[__i4],%%rax	\n\t"/* __in0 + c*istride */\
		"addq	$%c[__i4],%%rcx	\n\t"/* __in0 + d*istride */\
		"addq	$%c[__i4],%%rbx	\n\t"/* __in0 + e*istride */\
		"addq	$%c[__i4],%%rdx	\n\t"/* __in0 + f*istride */\
		"addq	$0x040,%%rsi 		/* cc += 8 */\n\t"\
		"vbroadcastsd 0x018(%%rsi),%%zmm12 	\n\t"/* load __r3 into zmm12 */\
		"vbroadcastsd 0x028(%%rsi),%%zmm13 	\n\t"/* load __rB into zmm13 */\
		"vbroadcastsd 0x038(%%rsi),%%zmm14 	\n\t"/* load __r7 into zmm14 */\
		"vbroadcastsd 0x048(%%rsi),%%zmm15 	\n\t"/* load __rF into zmm15 */\
		"vmovaps (%%rax),%%zmm0  \n\t vmovaps 0x040(%%rax),%%zmm1  \n\t"/* t24 =__A3r;	t25 =__A3i; */\
		"vmovaps (%%rcx),%%zmm4  \n\t vmovaps 0x040(%%rcx),%%zmm5  \n\t"/* t28 =__ABr;	t29 =__ABi; */\
		"vmovaps (%%rbx),%%zmm8  \n\t vmovaps 0x040(%%rbx),%%zmm9  \n\t"/* _a  =__A7r;	_b  =__A7i; */\
		"vmovaps (%%rdx),%%zmm6  \n\t vmovaps 0x040(%%rdx),%%zmm7  \n\t"/* t30 =__AFr;	t31 =__AFi; */\
		"vfmadd231pd	0x040(%%rax),%%zmm12,%%zmm0 \n\t	vfnmadd231pd	(%%rax),%%zmm12,%%zmm1  	\n\t"/* FNMA231(__A3i,__r3,t24);	 FMA231(__A3r,__r3,t25); */\
		"vfmadd231pd	0x040(%%rcx),%%zmm13,%%zmm4 \n\t	vfnmadd231pd	(%%rcx),%%zmm13,%%zmm5  	\n\t"/* FNMA231(__ABi,__rB,t28 );	 FMA231(__ABr,__rB,t29 ); */\
		"vbroadcastsd	0x040(%%rsi),%%zmm13		\n\t"/* load __cF7 */\
		"vfmadd231pd	0x040(%%rbx),%%zmm14,%%zmm8 \n\t	vfnmadd231pd	(%%rbx),%%zmm14,%%zmm9  	\n\t"/* FNMA231(__A7i,__r7,_a);		 FMA231(__A7r,__r7,_b); */\
		"vbroadcastsd	0x020(%%rsi),%%zmm14		\n\t"/* load __cB3 */\
		"vfmadd231pd	0x040(%%rdx),%%zmm15,%%zmm6 \n\t	vfnmadd231pd	(%%rdx),%%zmm15,%%zmm7  	\n\t"/* FNMA231(__AFi,__rF,t30 );	 FMA231(__AFr,__rF,t31 ); */\
		"vbroadcastsd	0x030(%%rsi),%%zmm15		\n\t"/* load __c73 */\
		"vmovaps		%%zmm8 ,%%zmm10				\n\t	vmovaps			%%zmm0,%%zmm2 				\n\t"/*	_c= _a;	t26= t24; */\
		" vfmadd231pd	%%zmm6 ,%%zmm13,%%zmm8 		\n\t	 vfmadd231pd	%%zmm4 ,%%zmm14,%%zmm0 		\n\t"/*	 FMA231(t30,__cF7,_a);		 FMA231(t28,__cB3,t24); */\
		"vmovaps		%%zmm9 ,%%zmm11				\n\t	vmovaps			%%zmm1 ,%%zmm3 				\n\t"/*	_d= _b;	t27= t25; */\
		" vfmadd231pd	%%zmm7 ,%%zmm13,%%zmm9 		\n\t	 vfmadd231pd	%%zmm5 ,%%zmm14,%%zmm1 		\n\t"/*	 FMA231(t31,__cF7,_b);		 FMA231(t29,__cB3,t25); */\
		"vfnmadd231pd	%%zmm6 ,%%zmm13,%%zmm10		\n\t	vfnmadd231pd	%%zmm4 ,%%zmm14,%%zmm2 		\n\t"/*	FNMA231(t30,__cF7,_c);		FNMA231(t28,__cB3,t26); */\
		"vfnmadd231pd	%%zmm7 ,%%zmm13,%%zmm11		\n\t	vfnmadd231pd	%%zmm5 ,%%zmm14,%%zmm3 		\n\t"/*	FNMA231(t31,__cF7,_d);		FNMA231(t29,__cB3,t27); */\
		"vmovaps		%%zmm0 ,%%zmm4 				\n\t	vmovaps			%%zmm1 ,%%zmm5 				\n\t"/*	t28 =t24;	t29 =t25; */\
		"vfnmadd231pd	%%zmm8 ,%%zmm15,%%zmm4 		\n\t	 vfmadd231pd	%%zmm8 ,%%zmm15,%%zmm0 		\n\t"/*	FNMA231(_a,__c73,t28);		 FMA231(_a,__c73,t24); */\
		"vfnmadd231pd	%%zmm9 ,%%zmm15,%%zmm5 		\n\t	 vfmadd231pd	%%zmm9 ,%%zmm15,%%zmm1 		\n\t"/*	FNMA231(_b,__c73,t29);		 FMA231(_b,__c73,t25); */\
		"vmovaps		%%zmm2 ,%%zmm6 				\n\t	vmovaps			%%zmm3 ,%%zmm7 				\n\t"/*	t30 =t26;	t31 =t27; */\
		" vfmadd231pd	%%zmm11,%%zmm15,%%zmm2 		\n\t	vfnmadd231pd	%%zmm11,%%zmm15,%%zmm6 		\n\t"/*	 FMA231(_d,__c73,t30);		FNMA231(_d,__c73,t26); */\
		"vfnmadd231pd	%%zmm10,%%zmm15,%%zmm3 		\n\t	 vfmadd231pd	%%zmm10,%%zmm15,%%zmm7 		\n\t"/*	FNMA231(_c,__c73,t31);		 FMA231(_c,__c73,t27); */\
		"vmovaps		%%zmm4 ,     (%%rcx)		\n\t	vmovaps			%%zmm0 ,     (%%rax)		\n\t"/* Write outputs into local store */\
		"vmovaps		%%zmm5 ,0x040(%%rcx)		\n\t	vmovaps			%%zmm1 ,0x040(%%rax)		\n\t"\
		"vmovaps		%%zmm6 ,     (%%rdx)		\n\t	vmovaps			%%zmm2 ,     (%%rbx)		\n\t"\
		"vmovaps		%%zmm7 ,0x040(%%rdx)		\n\t	vmovaps			%%zmm3 ,0x040(%%rbx)		\n\t"\
		"\n\t"\
	/*************************************************************************************/\
	/*  And now do four more radix-4 transforms, including the internal twiddle factors: */\
	/*************************************************************************************/\
	/* Block 0: Combine 0-output of each radix-4, i.e. inputs from __in0 + [0,4,8,c]*istride: */\
		"movq	%[__in0],%%rax		\n\t"\
		"leaq	%c[__i4](%%rax),%%rbx	\n\t"/* __in0 +   [4*istride] */\
		"leaq	%c[__i4](%%rbx),%%rcx	\n\t"/* __in0 + 2*[4*istride] */\
		"leaq	%c[__i4](%%rcx),%%rdx	\n\t"/* __in0 + 3*[4*istride] */\
		"subq	$0x0c0,%%rsi 		/* revert cc-ptr to base value */\n\t"\
		"\n\t"\
		/*...Read t0,8,16,24 from local store ... Do the 4 Im-part FMAs first, because their results needed 1st below */\
		"vbroadcastsd	0x050(%%rsi),%%zmm14		\n\t	vbroadcastsd	0x0d0(%%rsi),%%zmm15		\n\t"/* load __c2,c31 into pair of regs */\
		"vmovaps		     (%%rax),%%zmm0 		\n\t	vmovaps			     (%%rbx),%%zmm2 		\n\t"/*    t00;    t08; */\
		"vmovaps		     (%%rcx),%%zmm4 		\n\t	vmovaps			     (%%rdx),%%zmm6 		\n\t"/*    t16;    t24; */\
		"vmovaps			 %%zmm0 ,%%zmm8 		\n\t	vmovaps				 %%zmm4 ,%%zmm10		\n\t"/* _a=t00; _c=t16; */\
		" vfmadd231pd	%%zmm2 ,%%zmm14,%%zmm0 		\n\t	 vfmadd231pd	%%zmm6 ,%%zmm15,%%zmm4 		\n\t"/*	 FMA231(t08,__c2 ,t00);		 FMA231(t24,__c31,t16); */\
		"vmovaps		0x040(%%rax),%%zmm1 		\n\t	vmovaps			0x040(%%rbx),%%zmm3 		\n\t"/*    t01;    t09; */\
		"vmovaps		0x040(%%rcx),%%zmm5 		\n\t	vmovaps			0x040(%%rdx),%%zmm7 		\n\t"/*    t17;    t25; */\
		"vmovaps			 %%zmm1 ,%%zmm9 		\n\t	vmovaps				 %%zmm5 ,%%zmm11		\n\t"/* _b=t01; _d=t17; */\
		" vfmadd231pd	%%zmm3 ,%%zmm14,%%zmm1 		\n\t	 vfmadd231pd	%%zmm7 ,%%zmm15,%%zmm5 		\n\t"/*	 FMA231(t09,__c2 ,t01);		 FMA231(t25,__c31,t17); */\
		"vfnmadd231pd	%%zmm2 ,%%zmm14,%%zmm8 		\n\t	vfnmadd231pd	%%zmm6 ,%%zmm15,%%zmm10		\n\t"/*	FNMA231(t08,__c2 ,_a );		FNMA231(t24,__c31,_c ); */\
		"vbroadcastsd	0x090(%%rsi),%%zmm6 		\n\t"/* load __c1 */\
		"vfnmadd231pd	%%zmm3 ,%%zmm14,%%zmm9 		\n\t	vfnmadd231pd	%%zmm7 ,%%zmm15,%%zmm11		\n\t"/*	FNMA231(t09,__c2 ,_b );		FNMA231(t25,__c31,_d ); */\
		"vmovaps			 %%zmm0 ,%%zmm12		\n\t	vmovaps				 %%zmm1 ,%%zmm13		\n\t"/* _e = t00; _f = t01; */\
		" vfmadd231pd	%%zmm4 ,%%zmm6 ,%%zmm0 		\n\t	 vfmadd231pd	%%zmm5 ,%%zmm6 ,%%zmm1 		\n\t"/*	 FMA231(t16,__c1 ,t00);		 FMA231(t17,__c1 ,t01); */\
		"vfnmadd231pd	%%zmm4 ,%%zmm6 ,%%zmm12		\n\t	vfnmadd231pd	%%zmm5 ,%%zmm6 ,%%zmm13		\n\t"/*	FNMA231(t16,__c1 ,_e );		FNMA231(t17,__c1 ,_f ); */\
		"vmovaps			 %%zmm8 ,%%zmm2 		\n\t	vmovaps				 %%zmm9 ,%%zmm3 		\n\t"/* t08 = _a ; t09 = _b; */\
		"vfnmadd231pd	%%zmm11,%%zmm6 ,%%zmm2 		\n\t	 vfmadd231pd	%%zmm10,%%zmm6 ,%%zmm3 		\n\t"/*	FNMA231(_d ,__c1 ,t08);		 FMA231(_c ,__c1 ,t09); */\
		" vfmadd231pd	%%zmm11,%%zmm6 ,%%zmm8 		\n\t	vfnmadd231pd	%%zmm10,%%zmm6 ,%%zmm9 		\n\t"/*	 FMA231(_d ,__c1 ,_a );		FNMA231(_c ,__c1 ,_b ); */\
		/* Write outputs - Swap 4/C outputs for DIT */\
		"movq	%[__out0],%%r10		\n\t"\
		"leaq	%c[__o4](%%r10),%%r12	\n\t"/* __out0 +   [4*ostride] */\
		"leaq	%c[__o4](%%r12),%%r11	\n\t"/* __out0 + 2*[4*ostride] */\
		"leaq	%c[__o4](%%r11),%%r13	\n\t"/* __out0 + 3*[4*ostride] */\
		"vmovaps		%%zmm0 ,     (%%r10)		\n\t	vmovaps			%%zmm1 ,0x040(%%r10)		\n\t"/* __B0r= t00;		__B0i= t01; */\
		"vmovaps		%%zmm12,     (%%r11)		\n\t	vmovaps			%%zmm13,0x040(%%r11)		\n\t"/* __B8r= _e ;		__B8i= _f ; */\
		"vmovaps		%%zmm2 ,     (%%r13)		\n\t	vmovaps			%%zmm3 ,0x040(%%r13)		\n\t"/* __Bcr= t08;		__Bci= t09; */\
		"vmovaps		%%zmm8 ,     (%%r12)		\n\t	vmovaps			%%zmm9 ,0x040(%%r12)		\n\t"/* __B4r= _a ;		__B4i= _b ; */\
		"\n\t"\
		/*...Block 2: t4,12,20,28 */\
		"vbroadcastsd	0x110(%%rsi),%%zmm13	\n\t"/* cc0 + 0x22 = __two; Actually holds 1.0 in AVX2 mode */\
		"addq	$%c[__i1],%%rax	\n\t"/* __in0 + 1*istride */\
		"addq	$%c[__i1],%%rbx	\n\t"/* __in0 + 5*istride */\
		"addq	$%c[__i1],%%rcx	\n\t"/* __in0 + 9*istride */\
		"addq	$%c[__i1],%%rdx	\n\t"/* __in0 + d*istride */\
		"vbroadcastsd	0x050(%%rsi),%%zmm14		\n\t	vbroadcastsd	0x0d0(%%rsi),%%zmm15		\n\t"/* load __c2,31 into pair of regs */\
		"vmovaps		     (%%rcx),%%zmm4 		\n\t	vmovaps			0x040(%%rcx),%%zmm5 		\n\t"/*    t20;    t21; */\
		"vmovaps		     (%%rdx),%%zmm6 		\n\t	vmovaps			0x040(%%rdx),%%zmm7 		\n\t"/*    t28;    t29; */\
		"vmovaps			 %%zmm4 ,%%zmm10 		\n\t	vmovaps				 %%zmm7 ,%%zmm11		\n\t"/* _c=t20; _d=t29; */\
		" vfmadd231pd	%%zmm5 ,%%zmm13,%%zmm10		\n\t	 vfmsub231pd	%%zmm5 ,%%zmm13,%%zmm4 		\n\t"/*	FNMA231(t21,1.0,_c );		 FMA231(t21,1.0,t20); */\
		" vfmsub231pd	%%zmm6 ,%%zmm13,%%zmm11		\n\t	 vfmadd231pd	%%zmm6 ,%%zmm13,%%zmm7 		\n\t"/*	 FMA231(t28,1.0,_d );		FNMA231(t28,1.0,t29); */\
		"vbroadcastsd	0x010(%%rsi),%%zmm13		\n\t"/* load __c1i2 */\
		"vmovaps		     (%%rax),%%zmm0 		\n\t	vmovaps			0x040(%%rax),%%zmm1 		\n\t"/*    t04;    t05; */\
		"vmovaps		     (%%rbx),%%zmm2 		\n\t	vmovaps			0x040(%%rbx),%%zmm3 		\n\t"/*    t12;    t13; */\
		"vmovaps			 %%zmm0 ,%%zmm8 		\n\t	vmovaps				 %%zmm1 ,%%zmm9 		\n\t"/* _a = t04; _b = t05; */\
		"vmovaps			 %%zmm10,%%zmm5 		\n\t	vmovaps				 %%zmm4 ,%%zmm12		\n\t"/* t21 = _c; _e = t20; */\
		"vfnmadd231pd	%%zmm3 ,%%zmm14,%%zmm8 		\n\t	 vfmadd231pd	%%zmm11,%%zmm15,%%zmm5 		\n\t"/*	 FMA231(t13,__c2 ,_a );		 FMA231(_d ,__c31,t21); */\
		" vfmadd231pd	%%zmm2 ,%%zmm14,%%zmm9 		\n\t	 vfmadd231pd	%%zmm7 ,%%zmm15,%%zmm4 		\n\t"/*	FNMA231(t12,__c2 ,_b );		 FMA231(t29,__c31,t20); */\
		" vfmadd231pd	%%zmm3 ,%%zmm14,%%zmm0 		\n\t	vfnmadd231pd	%%zmm11,%%zmm15,%%zmm10		\n\t"/*	FNMA231(t13,__c2 ,t04);		FNMA231(_d ,__c31,_c ); */\
		"vfnmadd231pd	%%zmm2 ,%%zmm14,%%zmm1 		\n\t	vfnmadd231pd	%%zmm7 ,%%zmm15,%%zmm12		\n\t"/*	 FMA231(t12,__c2 ,t05);		FNMA231(t29,__c31,_e ); */\
		"vmovaps			 %%zmm8 ,%%zmm2 		\n\t	vmovaps				 %%zmm9 ,%%zmm3 		\n\t"/* t12 = _a; t13 = _b; */\
		"vfnmadd231pd	%%zmm4 ,%%zmm13,%%zmm2 		\n\t	 vfmadd231pd	%%zmm5 ,%%zmm13,%%zmm3 		\n\t"/*	FNMA231(t20,__c1i2,t12);	 FMA231(t21,__c1i2,t13); */\
		" vfmadd231pd	%%zmm4 ,%%zmm13,%%zmm8 		\n\t	vfnmadd231pd	%%zmm5 ,%%zmm13,%%zmm9 		\n\t"/*	 FMA231(t20,__c1i2,_a );	FNMA231(t21,__c1i2,_b ); */\
		"vmovaps			 %%zmm0 ,%%zmm11		\n\t	vmovaps				 %%zmm1 ,%%zmm7 		\n\t"/* _d = t04; t29 = t05; */\
		" vfmadd231pd	%%zmm10,%%zmm13,%%zmm0 		\n\t	 vfmadd231pd	%%zmm12,%%zmm13,%%zmm1 		\n\t"/*	 FMA231(_c ,__c1i2,t04);	 FMA231(_e ,__c1i2,t05); */\
		"vfnmadd231pd	%%zmm10,%%zmm13,%%zmm11		\n\t	vfnmadd231pd	%%zmm12,%%zmm13,%%zmm7 		\n\t"/*	FNMA231(_c ,__c1i2,_d );	FNMA231(_e ,__c1i2,t29); */\
		/* Write outputs - Not sure why, but need apply 6/E swap here *and* then pairwise swap, i.e. 2A[6][E] => [2A][E6] => E62A: */\
		"addq	$%c[__o2],%%r10	\n\t"/* __out0 + 2*ostride */\
		"addq	$%c[__o2],%%r12	\n\t"/* __out0 + 6*ostride */\
		"addq	$%c[__o2],%%r11	\n\t"/* __out0 + a*ostride */\
		"addq	$%c[__o2],%%r13	\n\t"/* __out0 + e*ostride */\
		"vmovaps		%%zmm2 ,     (%%r13)		\n\t	vmovaps			%%zmm3 ,0x040(%%r13)		\n\t"/* __BEr= t12;		__BEi= t13; */\
		"vmovaps		%%zmm8 ,     (%%r12)		\n\t	vmovaps			%%zmm9 ,0x040(%%r12)		\n\t"/* __B6r= _a ;		__B6i= _b ; */\
		"vmovaps		%%zmm0 ,     (%%r10)		\n\t	vmovaps			%%zmm1 ,0x040(%%r10)		\n\t"/* __B2r= t04;		__B2i= t05; */\
		"vmovaps		%%zmm11,     (%%r11)		\n\t	vmovaps			%%zmm7 ,0x040(%%r11)		\n\t"/* __BAr= _d ;		__BAi= t29; */\
		"\n\t"\
		/*...Block 1: t2,10,18,26 */\
		"addq	$%c[__i1],%%rax	\n\t"/* __in0 + 2*istride */\
		"addq	$%c[__i1],%%rbx	\n\t"/* __in0 + 6*istride */\
		"addq	$%c[__i1],%%rcx	\n\t"/* __in0 + a*istride */\
		"addq	$%c[__i1],%%rdx	\n\t"/* __in0 + e*istride */\
		"vmovaps		     (%%rbx),%%zmm2 		\n\t	vmovaps			0x040(%%rbx),%%zmm3 		\n\t"/*    t10;    t11; */\
		"vbroadcastsd	0x008(%%rsi),%%zmm15		\n\t"/* load __sc  */\
		"vbroadcastsd	0x0d0(%%rsi),%%zmm14		\n\t"/* load __c31 */\
		"vmovaps		     (%%rcx),%%zmm4 		\n\t	vmovaps			0x040(%%rcx),%%zmm5 		\n\t"/*    t18;    t19; */\
		"vmovaps		     (%%rdx),%%zmm6 		\n\t	vmovaps			0x040(%%rdx),%%zmm7 		\n\t"/*    t26;    t27; */\
		"vaddpd		%%zmm3 ,%%zmm2 ,%%zmm12 		\n\t"/* _e = t11+t10; */\
		"vsubpd		%%zmm2 ,%%zmm3 ,%%zmm13			\n\t"/* _f = t11-t10; */\
		"vmovaps			 %%zmm4 ,%%zmm10 		\n\t	vmovaps				 %%zmm7 ,%%zmm8 		\n\t"/* _c = t18; _a = t27; */\
		" vfmadd231pd	%%zmm5 ,%%zmm15,%%zmm10		\n\t	 vfmadd231pd	%%zmm6 ,%%zmm15,%%zmm8 		\n\t"/*	FNMA231(t19,__sc,_c );		 FMS231(t26,__sc,_a ); */\
		"vmovaps			 %%zmm5 ,%%zmm11 		\n\t	vmovaps				 %%zmm6 ,%%zmm9 		\n\t"/* _d = t19; _b = t26; */\
		"vbroadcastsd	0x018(%%rsi),%%zmm6 		\n\t"/* load __c2i2 */\
		"vfnmadd231pd	%%zmm4 ,%%zmm15,%%zmm11		\n\t	 vfmsub231pd	%%zmm7 ,%%zmm15,%%zmm9 		\n\t"/*	 FMA231(t18,__sc,_d );		 FMA231(t27,__sc,_b ); */\
		"vbroadcastsd	(%%rsi),%%zmm15				\n\t"/* load __c1_c */\
		"vmovaps		     (%%rax),%%zmm0 		\n\t	vmovaps			0x040(%%rax),%%zmm1 		\n\t"/*    t02;    t03; */\
		"vmovaps			 %%zmm10,%%zmm4 		\n\t	vmovaps				 %%zmm0 ,%%zmm2 		\n\t"/* t18 = _c;	t10 = t02; */\
		" vfmadd231pd	%%zmm8 ,%%zmm14,%%zmm4 		\n\t	 vfmadd231pd	%%zmm12,%%zmm6 ,%%zmm0 		\n\t"/*	 FMA231(_a ,__c31,t18);		 FMA231(_e ,__c2i2,t02); */\
		"vmovaps			 %%zmm11,%%zmm5 		\n\t	vmovaps				 %%zmm1 ,%%zmm3 		\n\t"/* t19 = _d;	t11 = t03; */\
		" vfmadd231pd	%%zmm9 ,%%zmm14,%%zmm5 		\n\t	 vfmadd231pd	%%zmm13,%%zmm6 ,%%zmm1 		\n\t"/*	 FMA231(_b ,__c31,t19);		 FMA231(_f ,__c2i2,t03); */\
		"vfnmadd231pd	%%zmm8 ,%%zmm14,%%zmm10		\n\t	vfnmadd231pd	%%zmm12,%%zmm6 ,%%zmm2 		\n\t"/*	FNMA231(_a ,__c31,_c );		FNMA231(_e ,__c2i2,t10); */\
		"vfnmadd231pd	%%zmm9 ,%%zmm14,%%zmm11		\n\t	vfnmadd231pd	%%zmm13,%%zmm6 ,%%zmm3 		\n\t"/*	FNMA231(_b ,__c31,_d );		FNMA231(_f ,__c2i2,t11); */\
		"vmovaps			 %%zmm0 ,%%zmm8 		\n\t	vmovaps				 %%zmm1 ,%%zmm9 		\n\t"/* _a = t02; _b = t03; */\
		" vfmadd231pd	%%zmm4 ,%%zmm15,%%zmm0 		\n\t	 vfmadd231pd	%%zmm5 ,%%zmm15,%%zmm1 		\n\t"/*	 FMA231(t18,__c1_c,t02);	 FMA231(t19,__c1_c,t03); */\
		"vfnmadd231pd	%%zmm4 ,%%zmm15,%%zmm8 		\n\t	vfnmadd231pd	%%zmm5 ,%%zmm15,%%zmm9 		\n\t"/*	FNMA231(t18,__c1_c,_a );	FNMA231(t19,__c1_c,_b ); */\
		"vmovaps			 %%zmm2 ,%%zmm12		\n\t	vmovaps				 %%zmm3 ,%%zmm13		\n\t"/* _e = t10; _f = t11; */\
		"vfnmadd231pd	%%zmm11,%%zmm15,%%zmm2 		\n\t	 vfmadd231pd	%%zmm10,%%zmm15,%%zmm3 		\n\t"/*	FNMA231(_d ,__c1_c,t10);	 FMA231(_c ,__c1_c,t11); */\
		" vfmadd231pd	%%zmm11,%%zmm15,%%zmm12		\n\t	vfnmadd231pd	%%zmm10,%%zmm15,%%zmm13		\n\t"/*	 FMA231(_d ,__c1_c,_e );	FNMA231(_c ,__c1_c,_f ); */\
		/* Write outputs: Swap 5/D outputs for DIT */\
		"subq	$%c[__o1],%%r10	\n\t"/* __out0 + 1*ostride */\
		"subq	$%c[__o1],%%r12	\n\t"/* __out0 + 5*ostride */\
		"subq	$%c[__o1],%%r11	\n\t"/* __out0 + 9*ostride */\
		"subq	$%c[__o1],%%r13	\n\t"/* __out0 + d*ostride */\
		"vmovaps		%%zmm0 ,     (%%r10)		\n\t	vmovaps			%%zmm1 ,0x040(%%r10)		\n\t"/* __B1r= t02;		__B1i= t03; */\
		"vmovaps		%%zmm8 ,     (%%r11)		\n\t	vmovaps			%%zmm9 ,0x040(%%r11)		\n\t"/* __B9r= _a ;		__B9i= _b ; */\
		"vmovaps		%%zmm2 ,     (%%r13)		\n\t	vmovaps			%%zmm3 ,0x040(%%r13)		\n\t"/* __BDr= t10;		__BDi= t11; */\
		"vmovaps		%%zmm12,     (%%r12)		\n\t	vmovaps			%%zmm13,0x040(%%r12)		\n\t"/* __B5r= _e ;		__B5i= _f ; */\
		"\n\t"\
		/*...Block 3: t6,14,22,30 */\
		"addq	$%c[__i1],%%rax	\n\t"/* __in0 + 3*istride */\
		"addq	$%c[__i1],%%rbx	\n\t"/* __in0 + 7*istride */\
		"addq	$%c[__i1],%%rcx	\n\t"/* __in0 + b*istride */\
		"addq	$%c[__i1],%%rdx	\n\t"/* __in0 + f*istride */\
		"vmovaps		     (%%rbx),%%zmm2 		\n\t	vmovaps			0x040(%%rbx),%%zmm3 		\n\t"/*    t14;    t15; */\
		"vbroadcastsd	0x008(%%rsi),%%zmm15		\n\t"/* load __sc  */\
		"vmovaps		     (%%rcx),%%zmm4 		\n\t	vmovaps			0x040(%%rcx),%%zmm5 		\n\t"/*    t22;    t23; */\
		"vmovaps		     (%%rdx),%%zmm6 		\n\t	vmovaps			0x040(%%rdx),%%zmm7 		\n\t"/*    t30;    t31; */\
		"vsubpd		%%zmm3 ,%%zmm2 ,%%zmm10 		\n\t"/* _c = t14-t15; */\
		"vaddpd		%%zmm3 ,%%zmm2 ,%%zmm11			\n\t"/* _d = t14+t15; */\
		"vbroadcastsd	0x0d0(%%rsi),%%zmm14		\n\t"/* load __c31 */\
		"vmovaps			 %%zmm5 ,%%zmm12 		\n\t	vmovaps				 %%zmm4 ,%%zmm13		\n\t"/* _e = t23; _f = t22;*/\
		" vfmadd231pd	%%zmm4 ,%%zmm15,%%zmm12		\n\t	 vfmsub231pd	%%zmm5 ,%%zmm15,%%zmm13		\n\t"/*	 FMS231(t22,__sc,_e );		 FMA231(t23,__sc,_f );*/\
		"vmovaps			 %%zmm6 ,%%zmm8  		\n\t	vmovaps				 %%zmm7 ,%%zmm9 		\n\t"/* _a = t30; _b = t31; */\
		" vfmadd231pd	%%zmm7 ,%%zmm15,%%zmm8 		\n\t	vfnmadd231pd	%%zmm6 ,%%zmm15,%%zmm9 		\n\t"/*	FNMA231(t31,__sc,_a );		 FMA231(t30,__sc,_b );*/\
		"vbroadcastsd	0x018(%%rsi),%%zmm6 		\n\t"/* load __c2i2 */\
		"vbroadcastsd	(%%rsi),%%zmm15				\n\t"/* load __c1_c */\
		"vmovaps		     (%%rax),%%zmm0 		\n\t	vmovaps			0x040(%%rax),%%zmm1 		\n\t"/*    t06;    t07; */\
		"vmovaps			 %%zmm1 ,%%zmm3 		\n\t	vmovaps				 %%zmm0 ,%%zmm2 		\n\t"/* t15= t07;	t14= t06; */\
		"vfnmadd231pd	%%zmm11,%%zmm6 ,%%zmm1 		\n\t	vfnmadd231pd	%%zmm10,%%zmm6 ,%%zmm0 		\n\t"/*	FNMA231(_d ,__c2i2,t07);	FNMA231(_c ,__c2i2,t06); */\
		"vmovaps			 %%zmm12,%%zmm4 		\n\t	vmovaps				 %%zmm13,%%zmm5 		\n\t"/* t22= _e; t23= _f; */\
		"vfnmadd231pd	%%zmm8 ,%%zmm14,%%zmm4 		\n\t	vfnmadd231pd	%%zmm9 ,%%zmm14,%%zmm5 		\n\t"/*	FNMA231(_a ,__c31 ,t22);	FNMA231(_b ,__c31 ,t23); */\
		" vfmadd231pd	%%zmm8 ,%%zmm14,%%zmm12		\n\t	 vfmadd231pd	%%zmm9 ,%%zmm14,%%zmm13		\n\t"/*	 FMA231(_a ,__c31 ,_e );	 FMA231(_b ,__c31 ,_f ); */\
		" vfmadd231pd	%%zmm10,%%zmm6 ,%%zmm2 		\n\t	 vfmadd231pd	%%zmm11,%%zmm6 ,%%zmm3 		\n\t"/*	 FMA231(_c ,__c2i2,t14);	 FMA231(_d ,__c2i2,t15); */\
		"vmovaps			 %%zmm0 ,%%zmm8 		\n\t	vmovaps				 %%zmm1 ,%%zmm9 		\n\t"/* _a = t06; _b = t07; */\
		" vfmadd231pd	%%zmm4 ,%%zmm15,%%zmm0 		\n\t	 vfmadd231pd	%%zmm5 ,%%zmm15,%%zmm1 		\n\t"/*	 FMA231(t22,__c1_c,t06);	 FMA231(t23,__c1_c,t07); */\
		"vfnmadd231pd	%%zmm4 ,%%zmm15,%%zmm8 		\n\t	vfnmadd231pd	%%zmm5 ,%%zmm15,%%zmm9 		\n\t"/*	FNMA231(t22,__c1_c,_a );	FNMA231(t23,__c1_c,_b ); */\
		"vmovaps			 %%zmm2 ,%%zmm10		\n\t	vmovaps				 %%zmm3 ,%%zmm11		\n\t"/* _c = t14; _d = t15; */\
		"vfnmadd231pd	%%zmm13,%%zmm15,%%zmm2 		\n\t	 vfmadd231pd	%%zmm12,%%zmm15,%%zmm3 		\n\t"/*	FNMA231(_f ,__c1_c,t14);	 FMA231(_e ,__c1_c,t15); */\
		" vfmadd231pd	%%zmm13,%%zmm15,%%zmm10		\n\t	vfnmadd231pd	%%zmm12,%%zmm15,%%zmm11		\n\t"/*	 FMA231(_f ,__c1_c,_c );	FNMA231(_e ,__c1_c,_d ); */\
		/* Write outputs: Swap 7/F outputs for DIT */\
		"addq	$%c[__o2],%%r10	\n\t"/* __out0 + 3*ostride */\
		"addq	$%c[__o2],%%r12	\n\t"/* __out0 + 7*ostride */\
		"addq	$%c[__o2],%%r11	\n\t"/* __out0 + b*ostride */\
		"addq	$%c[__o2],%%r13	\n\t"/* __out0 + f*ostride */\
		"vmovaps		%%zmm0 ,     (%%r10)		\n\t	vmovaps			%%zmm1 ,0x040(%%r10)		\n\t"/* __B3r= t06;		__B3i= t07; */\
		"vmovaps		%%zmm8 ,     (%%r11)		\n\t	vmovaps			%%zmm9 ,0x040(%%r11)		\n\t"/* __BBr= _a ;		__BBi= _b ; */\
		"vmovaps		%%zmm2 ,     (%%r13)		\n\t	vmovaps			%%zmm3 ,0x040(%%r13)		\n\t"/* __BFr= t14;		__BFi= t15; */\
		"vmovaps		%%zmm10,     (%%r12)		\n\t	vmovaps			%%zmm11,0x040(%%r12)		\n\t"/* __B7r= _c ;		__B7i= _d ; */\
		:					/* outputs: none */\
		: [__in0] "m" (Xin0)	/* All inputs from memory addresses here */\
		 ,[__i1] "e" (Xi1)\
		 ,[__i2] "e" (Xi2)\
		 ,[__i3] "e" (Xi3)\
		 ,[__i4] "e" (Xi4)\
		 ,[__out0] "m" (Xout0)\
		 ,[__o1] "e" (Xo1)\
		 ,[__o2] "e" (Xo2)\
		 ,[__o3] "e" (Xo3)\
		 ,[__o4] "e" (Xo4)\
		 ,[__cc0] "m" (Xcc0)\
		: "cc","memory","rax","rbx","rcx","rdx","rsi","r10","r11","r12","r13","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15"	/* Clobbered registers */\
	);\
	}

	// Doubled-data version of PAIR_SQUARE_4_SSE2, taking advantage of the 2-per-cycle throughput of AVX2 FMAs:
	#define PAIR_SQUARE_4_AVX2(XtAr, XtBr, XtCr, XtDr, Xc0, Xs0, XuAr, XuBr, XuCr, XuDr, Xc1, Xs1, Xforth)\
	{\
	__asm__ volatile (\
		"movq	%[__tDr]	,%%rdx							\n\t	movq	%[__uDr]	,%%r13		\n\t"\
		"movq	%[__tAr]	,%%rax							\n\t	movq	%[__uAr]	,%%r10		\n\t"\
			"movq	%[__tCr]	,%%rcx							\n\t	movq	%[__uCr]	,%%r12		\n\t"\
			"movq	%[__tBr]	,%%rbx							\n\t	movq	%[__uBr]	,%%r11		\n\t"\
		"vmovaps	    (%%rdx),%%zmm4						\n\t	vmovaps	    (%%r13),%%zmm12		\n\t"\
		"vmovaps	0x40(%%rdx),%%zmm5						\n\t	vmovaps	0x40(%%r13),%%zmm13		\n\t"\
		"vshufpd	$0x55,%%zmm4,%%zmm4,%%zmm4				\n\t	vshufpd	$0x55,%%zmm12,%%zmm12,%%zmm12		\n\t"\
		"vshufpd	$0x55,%%zmm5,%%zmm5,%%zmm5				\n\t	vshufpd	$0x55,%%zmm13,%%zmm13,%%zmm13		\n\t"\
		"vmovaps	    (%%rax),%%zmm6						\n\t	vmovaps	    (%%r10),%%zmm14		\n\t"\
		"vmovaps	0x40(%%rax),%%zmm7						\n\t	vmovaps	0x40(%%r10),%%zmm15		\n\t"\
		"vmulpd				%%zmm6 ,%%zmm4,%%zmm0			\n\t	vmulpd				%%zmm14,%%zmm12,%%zmm8 	\n\t"\
		"vmulpd				%%zmm7 ,%%zmm4,%%zmm1			\n\t	vmulpd				%%zmm15,%%zmm12,%%zmm9 	\n\t"\
		" vfmadd231pd		%%zmm7 ,%%zmm5,%%zmm0			\n\t	 vfmadd231pd		%%zmm15,%%zmm13,%%zmm8 	\n\t"\
		"vfnmadd231pd		%%zmm6 ,%%zmm5,%%zmm1			\n\t	vfnmadd231pd		%%zmm14,%%zmm13,%%zmm9 	\n\t"\
		"vmovaps	    (%%rcx),%%zmm6						\n\t	vmovaps	    (%%r12),%%zmm14		\n\t"\
		"vmovaps	0x40(%%rcx),%%zmm7						\n\t	vmovaps	0x40(%%r12),%%zmm15		\n\t"\
		"vshufpd	$0x55,%%zmm6,%%zmm6,%%zmm6				\n\t	vshufpd	$0x55,%%zmm14,%%zmm14,%%zmm14		\n\t"\
		"vshufpd	$0x55,%%zmm7,%%zmm7,%%zmm7				\n\t	vshufpd	$0x55,%%zmm15,%%zmm15,%%zmm15		\n\t"\
		"vmovaps	    (%%rbx),%%zmm4						\n\t	vmovaps	    (%%r11),%%zmm12		\n\t"\
		"vmovaps	0x40(%%rbx),%%zmm5						\n\t	vmovaps	0x40(%%r11),%%zmm13		\n\t"\
		"vmulpd				%%zmm4 ,%%zmm6,%%zmm2			\n\t	vmulpd				%%zmm12,%%zmm14,%%zmm10	\n\t"\
		"vmulpd				%%zmm5 ,%%zmm6,%%zmm3			\n\t	vmulpd				%%zmm13,%%zmm14,%%zmm11	\n\t"\
		" vfmadd231pd		%%zmm5 ,%%zmm7,%%zmm2			\n\t	 vfmadd231pd		%%zmm13,%%zmm15,%%zmm10	\n\t"\
		"vfnmadd231pd		%%zmm4 ,%%zmm7,%%zmm3			\n\t	vfnmadd231pd		%%zmm12,%%zmm15,%%zmm11	\n\t"\
		"\n\t"\
	"movq		%[__forth],%%rdi	\n\t"\
	"leaq	-0x40(%%rdi),%%rdi		\n\t"/* two */\
	"vmovaps	%%zmm11,0x180(%%rdi)	\n\t"/* Spill zmm11 datum in advance of batch-multiply below in which */\
		/* we use that reg to hold common multiplier ... slot 5 reg-widths above forth available for spills */\
		"vmovaps	    (%%rax)	,%%zmm4			\n\t	vmovaps	    (%%rbx)	,%%zmm6				\n\t	vmovaps	    (%%r10)	,%%zmm12			\n\t	vmovaps	    (%%r11)	,%%zmm14			\n\t"\
		"vmovaps	0x40(%%rax)	,%%zmm5			\n\t	vmovaps	0x40(%%rbx)	,%%zmm7				\n\t	vmovaps	0x40(%%r10)	,%%zmm13			\n\t	vmovaps	0x40(%%r11)	,%%zmm15			\n\t"\
		"vmulpd		%%zmm4,%%zmm4,%%zmm4		\n\t	vmulpd			%%zmm6,%%zmm6,%%zmm6	\n\t	vmulpd		%%zmm12,%%zmm12,%%zmm12		\n\t	vmulpd			%%zmm14,%%zmm14,%%zmm14	\n\t"\
		/* x^2 - y^2: */\
		"vfnmadd231pd	%%zmm5,%%zmm5,%%zmm4	\n\t	vfnmadd231pd	%%zmm7,%%zmm7,%%zmm6	\n\t	vfnmadd231pd	%%zmm13,%%zmm13,%%zmm12	\n\t	vfnmadd231pd	%%zmm15,%%zmm15,%%zmm14	\n\t"\
		"vmovaps	    (%%rax)	,%%zmm5			\n\t	vmovaps	    (%%rbx)	,%%zmm7				\n\t	vmovaps	    (%%r10)	,%%zmm13			\n\t	vmovaps	    (%%r11)	,%%zmm15			\n\t"\
		"vmulpd		0x40(%%rax)	,%%zmm5,%%zmm5	\n\t	vmulpd		0x40(%%rbx)	,%%zmm7,%%zmm7	\n\t	vmulpd		0x40(%%r10)	,%%zmm13,%%zmm13\n\t	vmulpd		0x40(%%r11)	,%%zmm15,%%zmm15\n\t"\
		"vaddpd		%%zmm5		,%%zmm5,%%zmm5	\n\t	vaddpd		%%zmm7		,%%zmm7,%%zmm7	\n\t	vaddpd		%%zmm13		,%%zmm13,%%zmm13\n\t	vaddpd		%%zmm15		,%%zmm15,%%zmm15\n\t"\
		"vmovaps	%%zmm4	,    (%%rax)		\n\t	vmovaps	%%zmm6	,    (%%rbx)			\n\t	vmovaps	%%zmm12	,    (%%r10)			\n\t	vmovaps	%%zmm14	,    (%%r11)			\n\t"\
		"vmovaps	%%zmm5	,0x40(%%rax)		\n\t	vmovaps	%%zmm7	,0x40(%%rbx)			\n\t	vmovaps	%%zmm13	,0x40(%%r10)			\n\t	vmovaps	%%zmm15	,0x40(%%r11)			\n\t"\
	"vmovaps	(%%rdi),%%zmm11	\n\t"/* the common multiplier. */\
		" vfmsub132pd	%%zmm11,%%zmm4,%%zmm0	\n\t	 vfmsub132pd	%%zmm11,%%zmm6,%%zmm2	\n\t	 vfmsub132pd	%%zmm11,%%zmm12,%%zmm8 	\n\t	vfmsub132pd	%%zmm11,%%zmm14,%%zmm10		\n\t"\
		" vfmsub132pd	%%zmm11,%%zmm5,%%zmm1	\n\t	 vfmsub132pd	%%zmm11,%%zmm7,%%zmm3	\n\t	 vfmsub132pd	%%zmm11,%%zmm13,%%zmm9 	\n\t	vfmsub132pd 0x180(%%rdi),%%zmm15,%%zmm11\n\t"/* Restore spilled zmm11-datum via mem-multiplicand */\
		"										\n\t"\
		"vmovaps	    (%%rdx)	,%%zmm4			\n\t	vmovaps	    (%%rcx)	,%%zmm6				\n\t	vmovaps	    (%%r13)	,%%zmm12			\n\t	vmovaps	    (%%r12)	,%%zmm14			\n\t"\
		"vmovaps	0x40(%%rdx)	,%%zmm5			\n\t	vmovaps	0x40(%%rcx)	,%%zmm7				\n\t	vmovaps	0x40(%%r13)	,%%zmm13			\n\t	vmovaps	0x40(%%r12)	,%%zmm15			\n\t"\
		"vmulpd			%%zmm4,%%zmm4,%%zmm4	\n\t	vmulpd			%%zmm6,%%zmm6,%%zmm6	\n\t	vmulpd			%%zmm12,%%zmm12,%%zmm12	\n\t	vmulpd			%%zmm14,%%zmm14,%%zmm14	\n\t"\
		"vfnmadd231pd	%%zmm5,%%zmm5,%%zmm4	\n\t	vfnmadd231pd	%%zmm7,%%zmm7,%%zmm6	\n\t	vfnmadd231pd	%%zmm13,%%zmm13,%%zmm12	\n\t	vfnmadd231pd	%%zmm15,%%zmm15,%%zmm14	\n\t"\
		"vmovaps	    (%%rdx)	,%%zmm5			\n\t	vmovaps	    (%%rcx)	,%%zmm7				\n\t	vmovaps	    (%%r13)	,%%zmm13			\n\t	vmovaps	    (%%r12)	,%%zmm15			\n\t"\
		"vmulpd		0x40(%%rdx)	,%%zmm5,%%zmm5	\n\t	vmulpd		0x40(%%rcx)	,%%zmm7,%%zmm7	\n\t	vmulpd		0x40(%%r13)	,%%zmm13,%%zmm13\n\t	vmulpd		0x40(%%r12)	,%%zmm15,%%zmm15\n\t"\
		"vaddpd		%%zmm5		,%%zmm5,%%zmm5	\n\t	vaddpd		%%zmm7		,%%zmm7,%%zmm7	\n\t	vaddpd		%%zmm13		,%%zmm13,%%zmm13\n\t	vaddpd		%%zmm15		,%%zmm15,%%zmm15\n\t"\
		"vmovaps	%%zmm4	,    (%%rdx)		\n\t	vmovaps	%%zmm6	,    (%%rcx)			\n\t	vmovaps	%%zmm12	,    (%%r13)			\n\t	vmovaps	%%zmm14	,    (%%r12)			\n\t"\
		"vmovaps	%%zmm5	,0x40(%%rdx)		\n\t	vmovaps	%%zmm7	,0x40(%%rcx)			\n\t	vmovaps	%%zmm13	,0x40(%%r13)			\n\t	vmovaps	%%zmm15	,0x40(%%r12)			\n\t"\
		"vshufpd	$0x55,%%zmm4,%%zmm4,%%zmm4	\n\t	vshufpd	$0x55,%%zmm6,%%zmm6,%%zmm6		\n\t	vshufpd	$0x55,%%zmm12,%%zmm12,%%zmm12	\n\t	vshufpd	$0x55,%%zmm14,%%zmm14,%%zmm14	\n\t"\
		"vshufpd	$0x55,%%zmm5,%%zmm5,%%zmm5	\n\t	vshufpd	$0x55,%%zmm7,%%zmm7,%%zmm7		\n\t	vshufpd	$0x55,%%zmm13,%%zmm13,%%zmm13	\n\t	vshufpd	$0x55,%%zmm15,%%zmm15,%%zmm15	\n\t"\
		"vsubpd	%%zmm4,%%zmm0,%%zmm0			\n\t	vsubpd	%%zmm6,%%zmm2,%%zmm2			\n\t	vsubpd	%%zmm12,%%zmm8 ,%%zmm8 			\n\t	vsubpd	%%zmm14,%%zmm10,%%zmm10			\n\t"\
		"vaddpd	%%zmm5,%%zmm1,%%zmm1			\n\t	vaddpd	%%zmm7,%%zmm3,%%zmm3			\n\t	vaddpd	%%zmm13,%%zmm9 ,%%zmm9 			\n\t	vaddpd	%%zmm15,%%zmm11,%%zmm11			\n\t"\
		"\n\t"\
		"movq	%[__c0]		,%%rax				\n\t	movq	%[__c1]		,%%r10		\n\t"\
		"movq	%[__s0]		,%%rbx				\n\t	movq	%[__s1]		,%%r11		\n\t"\
		"vmovaps	    (%%rax),%%zmm6			\n\t	vmovaps	    (%%r10),%%zmm14		\n\t"\
		"vmovaps		(%%rbx),%%zmm7			\n\t	vmovaps		(%%r11),%%zmm15		\n\t"\
	"leaq	0x40(%%rdi),%%rdi	\n\t"/* forth, from two */\
		"vmovaps	%%zmm0		,%%zmm4			\n\t	vmovaps	%%zmm8 		,%%zmm12		\n\t"\
		"vmovaps	%%zmm1		,%%zmm5			\n\t	vmovaps	%%zmm9 		,%%zmm13		\n\t"\
		" vfmadd132pd	%%zmm6 ,%%zmm4,%%zmm0	\n\t	 vfmadd132pd	%%zmm14,%%zmm12,%%zmm8 		\n\t"\
		" vfmadd132pd	%%zmm6 ,%%zmm5,%%zmm1	\n\t	 vfmadd132pd	%%zmm14,%%zmm13,%%zmm9 		\n\t"\
		"vfnmadd231pd	%%zmm7 ,%%zmm5,%%zmm0	\n\t	vfnmadd231pd	%%zmm15,%%zmm13,%%zmm8 		\n\t"\
		" vfmadd231pd	%%zmm7 ,%%zmm4,%%zmm1	\n\t	 vfmadd231pd	%%zmm15,%%zmm12,%%zmm9 		\n\t"\
		"vmovaps	%%zmm2	,%%zmm4				\n\t	vmovaps	%%zmm10	,%%zmm12		\n\t"\
		"vmovaps	%%zmm3	,%%zmm5				\n\t	vmovaps	%%zmm11	,%%zmm13		\n\t"\
		" vfmsub132pd	%%zmm7 ,%%zmm4,	%%zmm2	\n\t	 vfmsub132pd	%%zmm15,%%zmm12,%%zmm10		\n\t"\
		" vfmsub132pd	%%zmm7 ,%%zmm5,	%%zmm3	\n\t	 vfmsub132pd	%%zmm15,%%zmm13,%%zmm11		\n\t"\
		" vfmadd231pd	%%zmm6 ,%%zmm5,	%%zmm2	\n\t	 vfmadd231pd	%%zmm14,%%zmm13,%%zmm10		\n\t"\
		"vfnmadd231pd	%%zmm6 ,%%zmm4,	%%zmm3	\n\t	vfnmadd231pd	%%zmm14,%%zmm12,%%zmm11		\n\t"\
		"vmovaps	(%%rdi),%%zmm4	\n\t"/* 0.25 */\
		"vmulpd	%%zmm4,%%zmm0,%%zmm0						\n\t	vmulpd	%%zmm4,%%zmm8 ,%%zmm8 		\n\t"\
		"vmulpd	%%zmm4,%%zmm1,%%zmm1						\n\t	vmulpd	%%zmm4,%%zmm9 ,%%zmm9 		\n\t"\
		"vmulpd	%%zmm4,%%zmm2,%%zmm2						\n\t	vmulpd	%%zmm4,%%zmm10,%%zmm10		\n\t"\
		"vmulpd	%%zmm4,%%zmm3,%%zmm3						\n\t	vmulpd	%%zmm4,%%zmm11,%%zmm11		\n\t"\
		"\n\t"\
		"movq	%[__tAr]	,%%rax							\n\t	movq	%[__uAr]	,%%r10		\n\t"\
		"movq	%[__tBr]	,%%rbx							\n\t	movq	%[__uBr]	,%%r11		\n\t"\
		"\n\t"\
		"vmovaps	    (%%rax)	,%%zmm4						\n\t	vmovaps	    (%%r10)	,%%zmm12		\n\t"\
		"vmovaps	0x40(%%rax)	,%%zmm5						\n\t	vmovaps	0x40(%%r10)	,%%zmm13		\n\t"\
		"vmovaps	    (%%rbx)	,%%zmm6						\n\t	vmovaps	    (%%r11)	,%%zmm14		\n\t"\
		"vmovaps	0x40(%%rbx)	,%%zmm7						\n\t	vmovaps	0x40(%%r11)	,%%zmm15		\n\t"\
		"vaddpd	%%zmm0,%%zmm4,%%zmm4						\n\t	vaddpd	%%zmm8 ,%%zmm12,%%zmm12		\n\t"\
		"vaddpd	%%zmm1,%%zmm5,%%zmm5						\n\t	vaddpd	%%zmm9 ,%%zmm13,%%zmm13		\n\t"\
		"vsubpd	%%zmm2,%%zmm6,%%zmm6						\n\t	vsubpd	%%zmm10,%%zmm14,%%zmm14		\n\t"\
		"vsubpd	%%zmm3,%%zmm7,%%zmm7						\n\t	vsubpd	%%zmm11,%%zmm15,%%zmm15		\n\t"\
		"vmovaps	%%zmm4	,    (%%rax)					\n\t	vmovaps	%%zmm12	,    (%%r10)	\n\t"\
		"vmovaps	%%zmm5	,0x40(%%rax)					\n\t	vmovaps	%%zmm13	,0x40(%%r10)	\n\t"\
		"vmovaps	%%zmm6	,    (%%rbx)					\n\t	vmovaps	%%zmm14	,    (%%r11)	\n\t"\
		"vmovaps	%%zmm7	,0x40(%%rbx)					\n\t	vmovaps	%%zmm15	,0x40(%%r11)	\n\t"\
		"\n\t"\
		"movq	%[__tCr]	,%%rcx							\n\t	movq	%[__uCr]	,%%r12		\n\t"\
		"movq	%[__tDr]	,%%rdx							\n\t	movq	%[__uDr]	,%%r13		\n\t"\
		"\n\t"\
		"vshufpd	$0x55,%%zmm0,%%zmm0,%%zmm0				\n\t	vshufpd	$0x55,%%zmm8 ,%%zmm8 ,%%zmm8 		\n\t"\
		"vshufpd	$0x55,%%zmm1,%%zmm1,%%zmm1				\n\t	vshufpd	$0x55,%%zmm9 ,%%zmm9 ,%%zmm9 		\n\t"\
		"vshufpd	$0x55,%%zmm2,%%zmm2,%%zmm2				\n\t	vshufpd	$0x55,%%zmm10,%%zmm10,%%zmm10		\n\t"\
		"vshufpd	$0x55,%%zmm3,%%zmm3,%%zmm3				\n\t	vshufpd	$0x55,%%zmm11,%%zmm11,%%zmm11		\n\t"\
		"\n\t"\
		"vmovaps	    (%%rdx)	,%%zmm4						\n\t	vmovaps	    (%%r13)	,%%zmm12		\n\t"\
		"vmovaps	0x40(%%rdx)	,%%zmm5						\n\t	vmovaps	0x40(%%r13)	,%%zmm13		\n\t"\
		"vmovaps	    (%%rcx)	,%%zmm6						\n\t	vmovaps	    (%%r12)	,%%zmm14		\n\t"\
		"vmovaps	0x40(%%rcx)	,%%zmm7						\n\t	vmovaps	0x40(%%r12)	,%%zmm15		\n\t"\
		"vaddpd	%%zmm0,%%zmm4,%%zmm4						\n\t	vaddpd	%%zmm8 ,%%zmm12,%%zmm12		\n\t"\
		"vsubpd	%%zmm1,%%zmm5,%%zmm5						\n\t	vsubpd	%%zmm9 ,%%zmm13,%%zmm13		\n\t"\
		"vsubpd	%%zmm2,%%zmm6,%%zmm6						\n\t	vsubpd	%%zmm10,%%zmm14,%%zmm14		\n\t"\
		"vaddpd	%%zmm3,%%zmm7,%%zmm7						\n\t	vaddpd	%%zmm11,%%zmm15,%%zmm15		\n\t"\
		"vmovaps	%%zmm4	,    (%%rdx)					\n\t	vmovaps	%%zmm12	,    (%%r13)	\n\t"\
		"vmovaps	%%zmm5	,0x40(%%rdx)					\n\t	vmovaps	%%zmm13	,0x40(%%r13)	\n\t"\
		"vmovaps	%%zmm6	,    (%%rcx)					\n\t	vmovaps	%%zmm14	,    (%%r12)	\n\t"\
		"vmovaps	%%zmm7	,0x40(%%rcx)					\n\t	vmovaps	%%zmm15	,0x40(%%r12)	\n\t"\
		:					/* outputs: none */\
		: [__tAr] "m" (XtAr)	/* All inputs from memory addresses here */\
		 ,[__tBr] "m" (XtBr)\
		 ,[__tCr] "m" (XtCr)\
		 ,[__tDr] "m" (XtDr)\
		 ,[__c0] "m" (Xc0)\
		 ,[__s0] "m" (Xs0)\
		 ,[__uAr] "m" (XuAr)\
		 ,[__uBr] "m" (XuBr)\
		 ,[__uCr] "m" (XuCr)\
		 ,[__uDr] "m" (XuDr)\
		 ,[__c1] "m" (Xc1)\
		 ,[__s1] "m" (Xs1)\
		 ,[__forth] "m" (Xforth)\
		: "cc","memory","rax","rbx","rcx","rdx","rdi","r10","r11","r12","r13","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15"	/* Clobbered registers */\
	);\
	}

	// Sep 2019: 2-input FFT(a)*FFT(b) version of above PAIR_SQUARE_4_SSE2 macro, based on above ARM SIMD version of PAIR_MUL_4_SSE2.
	// NOTE: Unlike the PAIR_SQUARE_4 version of this macro, the MUL version assumes the sincos terms premultiplied by 1/4!
	// AVX-512 version has shufpd immediate = 0x55 = 01010101_2, which is the doubled analog of the AVX imm8 = 0x5 = 0101_2:
	#define PAIR_MUL_4_SSE2(XA0,XA1,XA2,XA3, XB0,XB1,XB2,XB3, Xc,Xs,Xforth)\
	{\
	__asm__ volatile (\
		/* Load a2,a3 and b2,b3, d0,d1-swap, then compute
			t0 = ~a3r*~b3r - ~a3i*~b3i, t2 = ~a3r*~b3i + ~a3i*~b3r
			t1 = ~a2r*~b2r - ~a2i*~b2i, t3 = ~a2r*~b2i + ~a2i*~b2r
		*/\
		"movq	%[__A2]	,%%rcx	\n\t"\
		"movq	%[__A3]	,%%rdx	\n\t"\
		"movq	%[__B2]	,%%rdi	\n\t"\
		"movq	%[__B3]	,%%rsi	\n\t"\
		/* Must load double-pairs-to-be-swapped into regs first, since SHUFPD takes low double from DEST and high from SRC: */\
		"vmovaps	    (%%rcx),%%zmm0		\n\t	vshufpd	$0x55,%%zmm0,%%zmm0,%%zmm0	\n\t"/* ~a2r */\
		"vmovaps	0x40(%%rcx),%%zmm1		\n\t	vshufpd	$0x55,%%zmm1,%%zmm1,%%zmm1	\n\t"/* ~a2i */\
		"vmovaps	    (%%rdi),%%zmm4		\n\t	vshufpd	$0x55,%%zmm4,%%zmm4,%%zmm4	\n\t"/* ~b2r */\
		"vmovaps	0x40(%%rdi),%%zmm5		\n\t	vshufpd	$0x55,%%zmm5,%%zmm5,%%zmm5	\n\t"/* ~b2i */\
		"vmovaps	    (%%rdx),%%zmm2		\n\t	vshufpd	$0x55,%%zmm2,%%zmm2,%%zmm2	\n\t"/* ~a3r */\
		"vmovaps	0x40(%%rdx),%%zmm3		\n\t	vshufpd	$0x55,%%zmm3,%%zmm3,%%zmm3	\n\t"/* ~a3i */\
		"vmovaps	    (%%rsi),%%zmm6		\n\t	vshufpd	$0x55,%%zmm6,%%zmm6,%%zmm6	\n\t"/* ~b3r */\
		"vmovaps	0x40(%%rsi),%%zmm7		\n\t	vshufpd	$0x55,%%zmm7,%%zmm7,%%zmm7	\n\t"/* ~b3i */\
		"vmulpd		%%zmm0	,%%zmm4	,%%zmm8	\n\t"/* ~a2r*~b2r */\
		"vmulpd		%%zmm0	,%%zmm5	,%%zmm9	\n\t"/* ~a2r*~b2i */\
		"vmulpd		%%zmm2	,%%zmm6	,%%zmm10\n\t"/* ~a3r*~b3r */\
		"vmulpd		%%zmm2	,%%zmm7	,%%zmm11\n\t"/* ~a3r*~b3i */\
	"vfnmadd231pd	%%zmm1	,%%zmm5	,%%zmm8	\n\t"/* t1 = ~a2r*~b2r - ~a2i*~b2i */\
	"vfmadd231pd	%%zmm1	,%%zmm4	,%%zmm9	\n\t"/* t3 = ~a2r*~b2i + ~a2i*~b2r */\
	"vfnmadd231pd	%%zmm3	,%%zmm7	,%%zmm10\n\t"/* t0 = ~a3r*~b3r - ~a3i*~b3i */\
	"vfmadd231pd	%%zmm3	,%%zmm6	,%%zmm11\n\t"/* t2 = ~a3r*~b3i + ~a3i*~b3r */\
		/* t1,3 and t0,2 not needed until final butterfly sequence, so write back to A2,3 memlocs: */\
		"vmovaps	%%zmm8	,    (%%rcx)	\n\t	movq	%[__A0]	,%%rax	\n\t"\
		"vmovaps	%%zmm9	,0x40(%%rcx)	\n\t	movq	%[__A1]	,%%rbx	\n\t"\
		"vmovaps	%%zmm10	,    (%%rdx)	\n\t	movq	%[__B0]	,%%rdi	\n\t"\
		"vmovaps	%%zmm11	,0x40(%%rdx)	\n\t	movq	%[__B1]	,%%rsi	\n\t"\
	/* a2,3 in zmm0-3, b2,3 in zmm4-7, t1,3 in (rcx), t0,2 in (rdx) */\
		/* calculate difference terms...these need the [a,b][2|3] vector-data to be d0,1-swapped:
			~a3r -= a0r, ~a3i += a0i,
			~a2r -= a1r, ~a2i += a1i, similar for b-data, but move ~b2 -+ b1 down to just before a1*b1 cmul to free up 2 regs.
		*/\
/*** Need ~a3r = a0r - ~a3r, not ~a3r -= a0r! [Similar for a2r,b3r,b2r] ***
************** As currently, a2r,a3r,b2r,b3r all negated! ****************/\
		"vmovaps	    (%%rax)	,%%zmm8		\n\t	vsubpd	%%zmm8	,%%zmm2	,%%zmm2	\n\t"/* ~a3r -= a0r */\
		"vmovaps	0x40(%%rax)	,%%zmm9		\n\t	vaddpd	%%zmm9	,%%zmm3	,%%zmm3	\n\t"/* ~a3i += a0i */\
		"vmovaps	    (%%rbx)	,%%zmm10	\n\t	vsubpd	%%zmm10	,%%zmm0	,%%zmm0	\n\t"/* ~a2r -= a1r */\
		"vmovaps	0x40(%%rbx)	,%%zmm11	\n\t	vaddpd	%%zmm11	,%%zmm1	,%%zmm1	\n\t"/* ~a2i += a1i */\
		"vmovaps	    (%%rdi)	,%%zmm12	\n\t	vsubpd	%%zmm12	,%%zmm6	,%%zmm6	\n\t"/* ~b3r -= b0r */\
		"vmovaps	0x40(%%rdi)	,%%zmm13	\n\t	vaddpd	%%zmm13	,%%zmm7	,%%zmm7	\n\t"/* ~b3i += b0i */\
		"vmovaps	    (%%rsi)	,%%zmm14	\n\t	vsubpd	%%zmm14	,%%zmm4	,%%zmm4	\n\t"/* ~b2r -= b1r */\
		"vmovaps	0x40(%%rsi)	,%%zmm15	\n\t	vaddpd	%%zmm15	,%%zmm5	,%%zmm5	\n\t"/* ~b2i += b1i */\
		/* now calculate 1st square-like term and store back in H(j) slot:
			t4 = a0r*b0r - a0i*b0i, a0i = a0r*b0i + a0i*b0r, a0r = t4
			t5 = a1r*b1r - a1i*b1i, a1i = a1r*b1i + a1i*b1r, a1r = t5
		*/\
		"vmulpd		    (%%rax)	,%%zmm12,%%zmm8	\n\t"/* a0r*b0r */\
		"vmulpd		    (%%rax)	,%%zmm13,%%zmm9	\n\t"/* a0r*b0i */\
		"vmulpd		    (%%rbx)	,%%zmm14,%%zmm10\n\t"/* a1r*b1r */\
		"vmulpd		    (%%rbx)	,%%zmm15,%%zmm11\n\t"/* a1r*b1i */\
	"vfnmadd231pd	0x40(%%rax)	,%%zmm13,%%zmm8	\n\t"/* a0r' = a0r*b0r - a0i*b0i */\
	"vfmadd231pd	0x40(%%rax)	,%%zmm12,%%zmm9	\n\t"/* a0i' = a0r*b0i + a0i*b0r */\
	"vfnmadd231pd	0x40(%%rbx)	,%%zmm15,%%zmm10\n\t"/* a1r' = a1r*b1r - a1i*b1i */\
	"vfmadd231pd	0x40(%%rbx)	,%%zmm14,%%zmm11\n\t"/* a1i' = a1r*b1i + a1i*b1r */\
	/* a0,1 in zmm8-11, a2,3 in zmm0-3, b2,3 in zmm4-7, t1,3 in (rcx), t0,2 in (rdx) */\
		/* calculate the complex products to build the second term:
			t4 = ~a3r*~b3r - ~a3i*~b3i, ~a3i = ~a3r*~b3i + ~a3i*~b3r, ~a3r,i in zmm2,3, ~b3r,i in zmm6,7
			t5 = ~a2r*~b2r - ~a2i*~b2i, ~a2i = ~a2r*~b2i + ~a2i*~b2r, ~arr,i in zmm0,1, ~b2r,i in zmm4,5
		*/\
/****************** a2r,a3r,b2r,b3r being negated means a2i,a3i come out negated ****************/\
		"vmulpd		%%zmm0	,%%zmm4	,%%zmm12\n\t"/* ~a2r*~b2r */\
		"vmulpd		%%zmm0	,%%zmm5	,%%zmm13\n\t"/* ~a2r*~b2i */\
		"vmulpd		%%zmm2	,%%zmm6	,%%zmm14\n\t"/* ~a3r*~b3r */\
		"vmulpd		%%zmm2	,%%zmm7	,%%zmm15\n\t"/* ~a3r*~b3i */\
	"vfnmadd231pd	%%zmm1	,%%zmm5	,%%zmm12\n\t"/* t5   = ~a2r*~b2r - ~a2i*~b2i */\
	"vfmadd231pd	%%zmm1	,%%zmm4	,%%zmm13\n\t"/* ~a2i = ~a2r*~b2i + ~a2i*~b2r */\
	"vfnmadd231pd	%%zmm3	,%%zmm7	,%%zmm14\n\t"/* t4   = ~a3r*~b3r - ~a3i*~b3i */\
	"vfmadd231pd	%%zmm3	,%%zmm6	,%%zmm15\n\t"/* ~a3i = ~a3r*~b3i + ~a3i*~b3r */\
		/* zmm0-7 free */\
		/* Assume [c0,s1],[s0,c1] sincos vector-data are in the [c] and [s]-input-pointers, then compute
			~a3r = [cc+0.25]*t4 - [ss]*~a3i, ~a3i = [ss]*t4 + [cc+0.25]*~a3i
			~a2r = [0.25-ss]*t5 - [cc]*~a2i, ~a2i = [cc]*t5 + [0.25-ss]*~a2i ,
		where cc = 0.25*[c0,s1] and ss = 0.25*[s0,c1]:
		*/\
/****************** a2i,a3i being negated requires +- sign swap in this next computation ****************/\
		"movq	%[__forth],%%rdi		\n\t	vmovaps	(%%rdi),%%zmm6		\n\t	vmovaps	%%zmm6,%%zmm7	\n\t"/* 2 copies of 0.25 */\
		"movq	%[__c]	,%%rdi			\n\t	vmovaps	(%%rdi),%%zmm4		\n\t"/*	cc assumed premultiplied by 0.25 */\
		"movq	%[__s]	,%%rsi			\n\t	vmovaps	(%%rsi),%%zmm5		\n\t"/*	ss assumed premultiplied by 0.25 */\
		"vaddpd	%%zmm4	,%%zmm6	,%%zmm6	\n\t	vsubpd	%%zmm5	,%%zmm7	,%%zmm7	\n\t"	/* [cc+0.25],[0.25-ss] in zmm6,7 */\
		"vmulpd		%%zmm14	,%%zmm6	,%%zmm2	\n\t"/*   t4*[cc+0.25] */\
		"vmulpd		%%zmm14	,%%zmm5	,%%zmm3	\n\t"/*   t4*[ss] */\
		"vmulpd		%%zmm12	,%%zmm7	,%%zmm0	\n\t"/*   t5*[0.25-ss] */\
		"vmulpd		%%zmm12	,%%zmm4	,%%zmm1	\n\t"/*   t5*[cc] */\
	"vfmadd231pd	%%zmm15	,%%zmm5	,%%zmm2	\n\t"/* ~a3r = [cc+0.25]*t4 - [ss]*~a3i in zmm2 */\
	"vfnmadd231pd	%%zmm15	,%%zmm6	,%%zmm3	\n\t"/* ~a3i = [cc+0.25]*~a3i - [ss]*t4 in zmm3 */\
	"vfmadd231pd	%%zmm13	,%%zmm4	,%%zmm0	\n\t"/* ~a2r = [0.25-ss]*t5 - [cc]*~a2i in zmm0 */\
	"vfnmadd231pd	%%zmm13	,%%zmm7	,%%zmm1	\n\t"/* ~a2i = [0.25-ss]*~a2i - [cc]*t5 in zmm1 */\
/****************** a2i,a3i in zmm1,3; *NOT* negated as in the sse2|avx case ****************/\
	/* a0,1 in zmm8-11, a2,3 in zmm0-3, t1,3 in (rcx), t0,2 in (rdx) */\
		"vmovaps	    (%%rdx)	,%%zmm4		\n\t"/* t0 */\
		"vmovaps	0x40(%%rdx)	,%%zmm5		\n\t"/* t2 */\
		"vmovaps	    (%%rcx)	,%%zmm6		\n\t"/* t1 */\
		"vmovaps	0x40(%%rcx)	,%%zmm7		\n\t"/* t3 */\
	/* and now complete and store the results:
		a0r -= ~a3r, a0i -= ~a3i
		a1r -= ~a2r, a1i -= ~a2i
	N-j terms:
		~a3r = t0 - ~a3r, ~a3i += t2
		~a2r = t1 - ~a2r, ~a2i += t3
	*/\
/****************** a2i,a3i negated means in rcol instead computing a0,1i += ~a3,2i, a3,2i = t2,3 - a3,2i ****************/\
		"vsubpd	%%zmm2	,%%zmm8	,%%zmm8	\n\t	vsubpd	%%zmm3	,%%zmm9	,%%zmm9	\n\t"	/* a0r,i in v8 ,9 ; ~a3r,i in v2,3 */\
		"vsubpd	%%zmm0	,%%zmm10,%%zmm10\n\t	vsubpd	%%zmm1	,%%zmm11,%%zmm11\n\t"	/* a1r,i in v10,11; ~a2r,i in v0,1 */\
		"vsubpd	%%zmm2	,%%zmm4	,%%zmm4	\n\t	vaddpd	%%zmm3	,%%zmm5	,%%zmm5	\n\t"	/* t0,2 in v4,5 */\
		"vsubpd	%%zmm0	,%%zmm6	,%%zmm6	\n\t	vaddpd	%%zmm1	,%%zmm7	,%%zmm7	\n\t"	/* t1,3 in v6,7 */\
	/* Interleave writes of a0,a1 with un-shufflings of ~a2,~a3: */\
		"vmovaps	%%zmm8	,    (%%rax)	\n\t	vshufpd	$0x55	,%%zmm4	,%%zmm4,%%zmm4	\n\t"/* ~a3r */\
		"vmovaps	%%zmm9	,0x40(%%rax)	\n\t	vshufpd	$0x55	,%%zmm5	,%%zmm5,%%zmm5	\n\t"/* ~a3i */\
		"vmovaps	%%zmm10	,    (%%rbx)	\n\t	vshufpd	$0x55	,%%zmm6	,%%zmm6,%%zmm6	\n\t"/* ~a2r */\
		"vmovaps	%%zmm11	,0x40(%%rbx)	\n\t	vshufpd	$0x55	,%%zmm7	,%%zmm7,%%zmm7	\n\t"/* ~a2i */\
		"vmovaps	%%zmm4	,    (%%rdx)	\n\t"\
		"vmovaps	%%zmm5	,0x40(%%rdx)	\n\t"\
		"vmovaps	%%zmm6	,    (%%rcx)	\n\t"\
		"vmovaps	%%zmm7	,0x40(%%rcx)	\n\t"\
		/* Cost (FMA = MUL): [43 vector-load/store (8 implicit), 12 shufpd, 18 addpd, 32 mulpd, 1 vector-register-copy] */\
		:					/* outputs: none */\
		: [__A0] "m" (XA0)	/* All inputs from memory addresses here */\
		 ,[__A1] "m" (XA1)\
		 ,[__A2] "m" (XA2)\
		 ,[__A3] "m" (XA3)\
		 ,[__B0] "m" (XB0)\
		 ,[__B1] "m" (XB1)\
		 ,[__B2] "m" (XB2)\
		 ,[__B3] "m" (XB3)\
		 ,[__c] "m" (Xc)\
		 ,[__s] "m" (Xs)\
		 ,[__forth] "m" (Xforth)\
		: "cc","memory","rax","rbx","rcx","rdx","rdi","rsi","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15"	/* Clobbered registers */\
	);\
	}

#elif defined(USE_AVX)	// Some macros shared between AVX/AVX2 builds, so differentiate those inside this larger clause

  #ifdef USE_AVX2	// FMA-based versions of selected macros in this file for Intel AVX2/FMA3

	/* Complex multiply of 2 roots of unity - use e.g. for "multiply up" of sincos twiddles. */
	#define SSE2_CMUL_EXPO(XcA,XcB,XcAmB,XcApB)\
	{\
	__asm__ volatile (\
		"movq	%[__cA]		,%%rax\n\t"\
		"movq	%[__cB]		,%%rbx\n\t"\
		"movq	%[__cAmB]	,%%rcx\n\t"\
		"movq	%[__cApB]	,%%rdx\n\t"\
		"\n\t"\
		"vmovaps	    (%%rax),%%ymm0\n\t"\
		"vmovaps	0x20(%%rax),%%ymm2\n\t"\
		"vmovaps	    (%%rbx),%%ymm4\n\t"\
		"vmovaps	0x20(%%rbx),%%ymm5\n\t"\
		"vmovaps	%%ymm0,%%ymm1\n\t"\
		"vmovaps	%%ymm2,%%ymm3\n\t"\
		"\n\t"\
		"vmulpd	%%ymm4,%%ymm0,%%ymm0\n\t"\
		"vmulpd	%%ymm5,%%ymm1,%%ymm1\n\t"\
		"vmovaps	%%ymm0,%%ymm6\n\t"\
		"vmovaps	%%ymm1,%%ymm7\n\t"\
	" vfmadd231pd	%%ymm5,%%ymm3,%%ymm0	\n\t"\
	"vfnmadd231pd	%%ymm4,%%ymm2,%%ymm1	\n\t"\
	"vfnmadd231pd	%%ymm5,%%ymm3,%%ymm6	\n\t"\
	" vfmadd231pd	%%ymm4,%%ymm2,%%ymm7	\n\t"\
		"vmovaps	%%ymm0,    (%%rcx)\n\t"\
		"vmovaps	%%ymm1,0x20(%%rcx)\n\t"\
		"vmovaps	%%ymm6,    (%%rdx)\n\t"\
		"vmovaps	%%ymm7,0x20(%%rdx)\n\t"\
		:					/* outputs: none */\
		: [__cA]  "m" (XcA)	/* All inputs from memory addresses here */\
		 ,[__cB]  "m" (XcB)\
		 ,[__cAmB] "m" (XcAmB)\
		 ,[__cApB] "m" (XcApB)\
		: "cc","memory","rax","rbx","rcx","rdx","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7"		/* Clobbered registers */\
	);\
	}

	#define SSE2_RADIX_03_DFT_X2(Xcc0, Xi0,Xi1,Xi2, Xo0,Xo1,Xo2, Xj0,Xj1,Xj2, Xu0,Xu1,Xu2)\
	{\
	__asm__ volatile (\
		"movq	%[__i0],%%rax	\n\t	movq	%[__j0],%%r10	\n\t"/*	movq	$0x3ff0000000000000, %%rdx	\n\t@// Int64 form of double-float 1.0 */\
		"movq	%[__i1],%%rbx	\n\t	movq	%[__j1],%%r11	\n\t"/*	vmovq	%%rdx, %%xmm15	\n\t@// Load into bottom half of xmm */\
		"movq	%[__i2],%%rcx	\n\t	movq	%[__j2],%%r12	\n\t"/*	vpbroadcastq	%%xmm15,%%ymm15	\n\t@// Broadcast lo half of xmm to all 4 64-bit slots of ymm */\
		"movq	%[__cc0],%%rdx	\n\t							\n\t"/* ewm: My version of gcc/asm gives assembler error: 'no such instruction' ... either update tools or use alternate */\
		"vmovaps	    (%%rbx),%%ymm2		\n\t	vmovaps	    (%%r11),%%ymm10	\n\t"\
		"vmovaps	0x20(%%rbx),%%ymm3		\n\t	vmovaps	0x20(%%r11),%%ymm11	\n\t"\
		"vmovaps	    (%%rax),%%ymm0		\n\t	vmovaps	    (%%r10),%%ymm8 	\n\t"\
		"vmovaps	0x20(%%rax),%%ymm1		\n\t	vmovaps	0x20(%%r10),%%ymm9 	\n\t"\
		"vmovaps	    (%%rcx),%%ymm6		\n\t	vmovaps	    (%%r12),%%ymm14	\n\t"\
		"vmovaps	0x20(%%rcx),%%ymm7		\n\t	vmovaps	0x20(%%r12),%%ymm15	\n\t"\
		"vmovaps	%%ymm2,%%ymm4			\n\t	vmovaps	%%ymm10,%%ymm12		\n\t"\
		"vmovaps	%%ymm3,%%ymm5			\n\t	vmovaps	%%ymm11,%%ymm13		\n\t"\
		"movq	%[__o0],%%rax				\n\t	movq	%[__u0],%%r10		\n\t"\
		"movq	%[__o1],%%rbx				\n\t	movq	%[__u1],%%r11		\n\t"\
		"movq	%[__o2],%%rcx				\n\t	movq	%[__u2],%%r12		\n\t"\
		"vaddpd	%%ymm6,%%ymm2,%%ymm2		\n\t	vaddpd	%%ymm14,%%ymm10,%%ymm10		\n\t"\
		"vaddpd	%%ymm7,%%ymm3,%%ymm3		\n\t	vaddpd	%%ymm15,%%ymm11,%%ymm11		\n\t"\
		"vsubpd	%%ymm6,%%ymm4,%%ymm4		\n\t	vsubpd	%%ymm14,%%ymm12,%%ymm12		\n\t"\
		"vsubpd	%%ymm7,%%ymm5,%%ymm5		\n\t	vsubpd	%%ymm15,%%ymm13,%%ymm13		\n\t"\
		"vaddpd	%%ymm2,%%ymm0,%%ymm0		\n\t	vaddpd	%%ymm10,%%ymm8 ,%%ymm8 		\n\t"\
		"vaddpd	%%ymm3,%%ymm1,%%ymm1		\n\t	vaddpd	%%ymm11,%%ymm9 ,%%ymm9 		\n\t"\
		"vmovaps	    (%%rdx),%%ymm6		\n\t"\
		"vmovaps	0x20(%%rdx),%%ymm7		\n\t"\
		"vmovaps	%%ymm0,    (%%rax)		\n\t	vmovaps	%%ymm8 ,    (%%r10)	\n\t"\
		"vmovaps	%%ymm1,0x20(%%rax)		\n\t	vmovaps	%%ymm9 ,0x20(%%r10)	\n\t"\
	" vfmadd132pd	%%ymm6,%%ymm0,%%ymm2 	\n\t  vfmadd132pd	%%ymm6,%%ymm8,%%ymm10	\n\t"\
	" vfmadd132pd	%%ymm6,%%ymm1,%%ymm3 	\n\t  vfmadd132pd	%%ymm6,%%ymm9,%%ymm11	\n\t"\
		"vmovaps	%%ymm2,%%ymm0			\n\t	vmovaps	%%ymm10,%%ymm8 		\n\t"\
		"vmovaps	%%ymm3,%%ymm1			\n\t	vmovaps	%%ymm11,%%ymm9 		\n\t"\
	" vfmadd231pd	%%ymm7,%%ymm5,%%ymm0 	\n\t  vfmadd231pd	%%ymm7,%%ymm13,%%ymm8 	\n\t"\
	"vfnmadd231pd	%%ymm7,%%ymm4,%%ymm1 	\n\t vfnmadd231pd	%%ymm7,%%ymm12,%%ymm9 	\n\t"\
	"vfnmadd231pd	%%ymm7,%%ymm5,%%ymm2 	\n\t vfnmadd231pd	%%ymm7,%%ymm13,%%ymm10	\n\t"\
	" vfmadd231pd	%%ymm7,%%ymm4,%%ymm3 	\n\t  vfmadd231pd	%%ymm7,%%ymm12,%%ymm11	\n\t"\
		"vmovaps	%%ymm0,    (%%rcx)		\n\t	vmovaps	%%ymm8 ,    (%%r12)	\n\t"\
		"vmovaps	%%ymm1,0x20(%%rcx)		\n\t	vmovaps	%%ymm9 ,0x20(%%r12)	\n\t"\
		"vmovaps	%%ymm2,    (%%rbx)		\n\t	vmovaps	%%ymm10,    (%%r11)	\n\t"\
		"vmovaps	%%ymm3,0x20(%%rbx)		\n\t	vmovaps	%%ymm11,0x20(%%r11)	\n\t"\
		:					/* outputs: none */\
		: [__cc0] "m" (Xcc0)	/* All inputs from memory addresses here */\
		 ,[__i0] "m" (Xi0)\
		 ,[__i1] "m" (Xi1)\
		 ,[__i2] "m" (Xi2)\
		 ,[__o0] "m" (Xo0)\
		 ,[__o1] "m" (Xo1)\
		 ,[__o2] "m" (Xo2)\
		 ,[__j0] "m" (Xj0)\
		 ,[__j1] "m" (Xj1)\
		 ,[__j2] "m" (Xj2)\
		 ,[__u0] "m" (Xu0)\
		 ,[__u1] "m" (Xu1)\
		 ,[__u2] "m" (Xu2)\
		: "cc","memory","rax","rbx","rcx","rdx","r10","r11","r12","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15"		/* Clobbered registers */\
	);\
	}

	#define SSE2_RADIX_05_DFT_0TWIDDLE_X2(Xcc1,Xtwo, Xi0,Xi1,Xi2,Xi3,Xi4, Xo0,Xo1,Xo2,Xo3,Xo4, Xj0,Xj1,Xj2,Xj3,Xj4, Xu0,Xu1,Xu2,Xu3,Xu4)\
	{\
	__asm__ volatile (\
		"movq	%[__i1],%%rax				\n\t	movq	%[__j1],%%r11		\n\t"\
		"movq	%[__i4],%%rdx				\n\t	movq	%[__j4],%%r14		\n\t"\
		"vmovaps	    (%%rax),%%ymm0		\n\t	vmovaps	    (%%r11),%%ymm8 	\n\t"\
		"vmovaps	0x20(%%rax),%%ymm1		\n\t	vmovaps	0x20(%%r11),%%ymm9 	\n\t"\
		"vmovaps	    (%%rdx),%%ymm6		\n\t	vmovaps	    (%%r14),%%ymm14	\n\t"\
		"vmovaps	0x20(%%rdx),%%ymm7		\n\t	vmovaps	0x20(%%r14),%%ymm15	\n\t"\
		"movq	%[__i2],%%rbx				\n\t	movq	%[__j2],%%r12		\n\t"\
		"movq	%[__i3],%%rcx				\n\t	movq	%[__j3],%%r13		\n\t"\
		"vsubpd	%%ymm6,%%ymm0,%%ymm0		\n\t	vsubpd		%%ymm14,%%ymm8 ,%%ymm8 		\n\t"\
		"vsubpd	%%ymm7,%%ymm1,%%ymm1		\n\t	vsubpd		%%ymm15,%%ymm9 ,%%ymm9 		\n\t"\
		"vmovaps	    (%%rbx),%%ymm2		\n\t	vmovaps	    (%%r12),%%ymm10	\n\t"\
		"vmovaps	0x20(%%rbx),%%ymm3		\n\t	vmovaps	0x20(%%r12),%%ymm11	\n\t"\
		"vmovaps	    (%%rcx),%%ymm4		\n\t	vmovaps	    (%%r13),%%ymm12	\n\t"\
		"vmovaps	0x20(%%rcx),%%ymm5		\n\t"/*	vmovaps	0x20(%%r13),%%ymm13	\n\t Use for two in 8 ensuing FMAs instead */\
	"movq		%[__two],%%rcx		\n\t"\
	"vmovaps	(%%rcx),%%ymm13		\n\t"/* two */\
		"vsubpd	%%ymm4,%%ymm2,%%ymm2		\n\t	vsubpd		%%ymm12,%%ymm10,%%ymm10		\n\t"\
		"vsubpd	%%ymm5,%%ymm3,%%ymm3		\n\t	vsubpd	0x20(%%r13),%%ymm11,%%ymm11		\n\t"\
	" vfmadd132pd	%%ymm13,%%ymm0,%%ymm6 	\n\t  vfmadd132pd		%%ymm13,%%ymm8,%%ymm14	\n\t"\
	" vfmadd132pd	%%ymm13,%%ymm1,%%ymm7 	\n\t  vfmadd132pd		%%ymm13,%%ymm9,%%ymm15	\n\t"\
	" vfmadd132pd	%%ymm13,%%ymm2,%%ymm4 	\n\t  vfmadd132pd		%%ymm13,%%ymm10,%%ymm12	\n\t"\
	" vfmadd132pd	%%ymm13,%%ymm3,%%ymm5 	\n\t  vfmadd132pd	0x20(%%r13),%%ymm11,%%ymm13	\n\t"\
	/*==== spill ymm2,3 here (still use once as dest of add/sub below, but pvsly carried copies of values-here around) =====*/\
	"vmovaps	%%ymm2,    (%%rbx)		\n\t	vmovaps	%%ymm10,    (%%r12)		\n\t"\
	"vmovaps	%%ymm3,0x20(%%rbx)		\n\t	vmovaps	%%ymm11,0x20(%%r12)		\n\t"\
		"movq	%[__cc1],%%rax				\n\t"\
		"vsubpd	%%ymm4,%%ymm6,%%ymm6		\n\t	vsubpd	%%ymm12,%%ymm14,%%ymm14		\n\t"\
		"vsubpd	%%ymm5,%%ymm7,%%ymm7		\n\t	vsubpd	%%ymm13,%%ymm15,%%ymm15		\n\t"\
		"movq	%[__i0],%%rsi				\n\t	movq	%[__j0],%%r10		\n\t"\
	" vfmadd132pd	(%%rcx),%%ymm6,%%ymm4 	\n\t  vfmadd132pd	(%%rcx),%%ymm14,%%ymm12	\n\t"\
	" vfmadd132pd	(%%rcx),%%ymm7,%%ymm5 	\n\t  vfmadd132pd	(%%rcx),%%ymm15,%%ymm13	\n\t"\
		"vaddpd	    (%%rsi),%%ymm4,%%ymm2	\n\t	vaddpd	    (%%r10),%%ymm12,%%ymm10	\n\t"\
		"vaddpd	0x20(%%rsi),%%ymm5,%%ymm3	\n\t	vaddpd	0x20(%%r10),%%ymm13,%%ymm11	\n\t"\
		"movq	%[__o0],%%rdi				\n\t	movq	%[__u0],%%r15		\n\t"\
		"vmovaps	%%ymm2,    (%%rdi)		\n\t	vmovaps	%%ymm10,    (%%r15)		\n\t"\
		"vmovaps	%%ymm3,0x20(%%rdi)		\n\t	vmovaps	%%ymm11,0x20(%%r15)		\n\t"\
	"vmovaps	    (%%rax),%%ymm2	\n\t"/* each of these 2 mults used 4x, so trade 8 loads for 4: */\
	"vmovaps	0x20(%%rax),%%ymm3	\n\t"/* 2 of mults, 2 to read clobbered ymm2,3 values from mem */\
		"vmulpd			%%ymm3 ,%%ymm6,%%ymm6	\n\t	vmulpd		%%ymm3,%%ymm14,%%ymm14	\n\t"\
		"vmulpd			%%ymm3 ,%%ymm7,%%ymm7	\n\t	vmulpd		%%ymm3,%%ymm15,%%ymm15	\n\t"\
	" vfmadd213pd	    (%%rdi),%%ymm2,%%ymm4 	\n\t  vfmadd132pd	%%ymm2,%%ymm10,%%ymm12	\n\t"\
	" vfmadd213pd	0x20(%%rdi),%%ymm2,%%ymm5 	\n\t  vfmadd132pd	%%ymm2,%%ymm11,%%ymm13	\n\t"\
		"vsubpd	%%ymm6,%%ymm4,%%ymm4		\n\t	vsubpd	%%ymm14,%%ymm12,%%ymm12		\n\t"\
		"vsubpd	%%ymm7,%%ymm5,%%ymm5		\n\t	vsubpd	%%ymm15,%%ymm13,%%ymm13		\n\t"\
	" vfmadd132pd	(%%rcx),%%ymm4,%%ymm6 	\n\t  vfmadd132pd	(%%rcx),%%ymm12,%%ymm14	\n\t"\
	" vfmadd132pd	(%%rcx),%%ymm5,%%ymm7 	\n\t  vfmadd132pd	(%%rcx),%%ymm13,%%ymm15	\n\t"\
		"vmovaps	%%ymm4,    (%%rsi)		\n\t	vmovaps	%%ymm12,    (%%r10)		\n\t"\
		"vmovaps	%%ymm5,0x20(%%rsi)		\n\t	vmovaps	%%ymm13,0x20(%%r10)		\n\t"\
		"vmovaps	%%ymm0,%%ymm4			\n\t	vmovaps	%%ymm8 ,%%ymm12			\n\t"\
		"vmovaps	%%ymm1,%%ymm5			\n\t	vmovaps	%%ymm9 ,%%ymm13			\n\t"\
	/*==== restore spill of ymm2,3 here =====*/\
	"vmovaps	    (%%rbx),%%ymm2		\n\t	vmovaps	    (%%r12),%%ymm10		\n\t"\
	"vmovaps	0x20(%%rbx),%%ymm3		\n\t	vmovaps	0x20(%%r12),%%ymm11		\n\t"\
		"vsubpd	%%ymm2,%%ymm0,%%ymm0		\n\t	vsubpd	%%ymm10,%%ymm8,%%ymm8 		\n\t"\
		"vsubpd	%%ymm3,%%ymm1,%%ymm1		\n\t	vsubpd	%%ymm11,%%ymm9,%%ymm9 		\n\t"\
		"vmulpd	0x40(%%rax),%%ymm0,%%ymm0	\n\t	vmulpd	0x40(%%rax),%%ymm8 ,%%ymm8 	\n\t"\
		"vmulpd	0x40(%%rax),%%ymm1,%%ymm1	\n\t	vmulpd	0x40(%%rax),%%ymm9 ,%%ymm9 	\n\t"\
	/*==== spill ymm4,5 here (still use once as mult in 2 of the 8 FMAs below, but now read those values from the spill addrs) =====*/\
	"vmovaps	%%ymm4,    (%%rbx)	\n\t"\
	"vmovaps	%%ymm5,0x20(%%rbx)	\n\t"\
	"vmovaps	0x60(%%rax),%%ymm4	\n\t"/* each of these 2 mults used 4x, so trade 8 loads for 4: */\
	"vmovaps	0x80(%%rax),%%ymm5	\n\t"/* 2 of mults, 2 to read clobbered ymm2,3 values from mem */\
	" vfmadd132pd		%%ymm4 ,%%ymm0,%%ymm2 	\n\t  vfmadd132pd	%%ymm4 ,%%ymm8 ,%%ymm10	\n\t"\
	" vfmadd132pd		%%ymm4 ,%%ymm1,%%ymm3 	\n\t  vfmadd132pd	%%ymm4 ,%%ymm9 ,%%ymm11	\n\t"\
	"vfnmadd231pd	    (%%rbx),%%ymm5,%%ymm0 	\n\t vfnmadd231pd	%%ymm5 ,%%ymm12,%%ymm8 	\n\t"\
	"vfnmadd231pd	0x20(%%rbx),%%ymm5,%%ymm1 	\n\t vfnmadd231pd	%%ymm5 ,%%ymm13,%%ymm9 	\n\t"\
		"vmovaps	    (%%rsi),%%ymm4		\n\t	vmovaps	    (%%r10),%%ymm12		\n\t"\
		"vmovaps	0x20(%%rsi),%%ymm5		\n\t	vmovaps	0x20(%%r10),%%ymm13		\n\t"\
		"movq	%[__o1],%%rax				\n\t	movq	%[__u1],%%r11			\n\t"\
		"movq	%[__o4],%%rdx				\n\t	movq	%[__u4],%%r14			\n\t"\
		"vsubpd	%%ymm3,%%ymm6,%%ymm6		\n\t	vsubpd	%%ymm11,%%ymm14,%%ymm14		\n\t"\
		"vsubpd	%%ymm2,%%ymm7,%%ymm7		\n\t	vsubpd	%%ymm10,%%ymm15,%%ymm15		\n\t"\
		"vmovaps	%%ymm6,    (%%rax)		\n\t	vmovaps	%%ymm14,    (%%r11)		\n\t"\
		"vmovaps	%%ymm7,0x20(%%rdx)		\n\t	vmovaps	%%ymm15,0x20(%%r14)		\n\t"\
	"vmovaps	(%%rcx),%%ymm6		\n\t"/* two ... ymm6 still used once below, but now read that value from above (%%rax) write-address */\
	" vfmadd213pd	(%%rax),%%ymm6,%%ymm3 	\n\t  vfmadd132pd	%%ymm6 ,%%ymm14,%%ymm11	\n\t"\
	" vfmadd132pd	%%ymm6 ,%%ymm7,%%ymm2 	\n\t  vfmadd132pd	%%ymm6 ,%%ymm15,%%ymm10	\n\t"\
		"vmovaps	%%ymm3,    (%%rdx)		\n\t	vmovaps	%%ymm11,    (%%r14)		\n\t"\
		"vmovaps	%%ymm2,0x20(%%rax)		\n\t	vmovaps	%%ymm10,0x20(%%r11)		\n\t"\
		"movq	%[__o2],%%rbx				\n\t	movq	%[__u2],%%r12			\n\t"\
		"movq	%[__o3],%%rcx				\n\t	movq	%[__u3],%%r13			\n\t"\
		"vsubpd	%%ymm1,%%ymm4,%%ymm4		\n\t	vsubpd	%%ymm9 ,%%ymm12,%%ymm12		\n\t"\
		"vsubpd	%%ymm0,%%ymm5,%%ymm5		\n\t	vsubpd	%%ymm8 ,%%ymm13,%%ymm13		\n\t"\
		"vmovaps	%%ymm4,    (%%rbx)		\n\t	vmovaps	%%ymm12,    (%%r12)		\n\t"\
		"vmovaps	%%ymm5,0x20(%%rcx)		\n\t	vmovaps	%%ymm13,0x20(%%r13)		\n\t"\
	" vfmadd132pd	%%ymm6 ,%%ymm4,%%ymm1 	\n\t  vfmadd132pd	%%ymm6 ,%%ymm12,%%ymm9	\n\t"\
	" vfmadd132pd	%%ymm6 ,%%ymm5,%%ymm0 	\n\t  vfmadd132pd	%%ymm6 ,%%ymm13,%%ymm8	\n\t"\
		"vmovaps	%%ymm1,    (%%rcx)		\n\t	vmovaps	%%ymm9 ,    (%%r13)		\n\t"\
		"vmovaps	%%ymm0,0x20(%%rbx)		\n\t	vmovaps	%%ymm8 ,0x20(%%r12)		\n\t"\
		:					/* outputs: none */\
		: [__cc1] "m" (Xcc1)	/* All inputs from memory addresses here */\
		 ,[__two] "m" (Xtwo)\
		 ,[__i0] "m" (Xi0)\
		 ,[__i1] "m" (Xi1)\
		 ,[__i2] "m" (Xi2)\
		 ,[__i3] "m" (Xi3)\
		 ,[__i4] "m" (Xi4)\
		 ,[__o0] "m" (Xo0)\
		 ,[__o1] "m" (Xo1)\
		 ,[__o2] "m" (Xo2)\
		 ,[__o3] "m" (Xo3)\
		 ,[__o4] "m" (Xo4)\
		 ,[__j0] "m" (Xj0)\
		 ,[__j1] "m" (Xj1)\
		 ,[__j2] "m" (Xj2)\
		 ,[__j3] "m" (Xj3)\
		 ,[__j4] "m" (Xj4)\
		 ,[__u0] "m" (Xu0)\
		 ,[__u1] "m" (Xu1)\
		 ,[__u2] "m" (Xu2)\
		 ,[__u3] "m" (Xu3)\
		 ,[__u4] "m" (Xu4)\
		: "cc","memory","rax","rbx","rcx","rdx","rdi","rsi","r10","r11","r12","r13","r14","r15","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15"		/* Clobbered registers */\
	);\
	}

	// Here are the array-of-doubles index offsets w.r.to the __c = cc0 base-root address of the various derived sincos terms:
	// Datum	Offset	Datum	Offset
	// ------	------	------	------
	// __c1_c	0x00	__sc	0x01
	// __c1i2	0x02	__c2i2	0x03
	// __c8		0x04	__r8	0x05
	// __c4		0x06	__r4	0x07
	// __cC4	0x08	__rC	0x09
	// __c2		0x0a	__r2	0x0b
	// __cA2	0x0c	__rA	0x0d
	// __c62	0x0e	__r6	0x0f
	// __cE6	0x10	__rE	0x11
	// __c1		0x12	__r1	0x13
	// __c91	0x14	__r9	0x15
	// __c51	0x16	__r5	0x17
	// __cD5	0x18	__rD	0x19
	// __c31	0x1a	__r3	0x1b
	// __cB3	0x1c	__rB	0x1d
	// __c73	0x1e	__r7	0x1f
	// __cF7	0x20	__rF	0x21

	// Remember that for AVX2-style 3-operand FMA in AT&T syntax, the result overwrites the rightmost input!
	#define SSE2_RADIX16_DIF_FMA_OOP(Xin0,Xi1,Xi2,Xi3,Xi4, Xout0,Xout1,Xout2,Xout3,Xout4,Xout5,Xout6,Xout7,Xout8,Xout9,Xouta,Xoutb,Xoutc,Xoutd,Xoute,Xoutf, Xcc0)\
	{\
	__asm__ volatile (\
	/*...Block 0: Do in-place, i.e. outputs into __in0 + [0,1,2,3]*istride: */\
		"movq	%[__in0],%%rax		\n\t"\
		"leaq	%c[__i1](%%rax),%%rcx	\n\t"/* __in0 +   istride */\
		"leaq	%c[__i2](%%rax),%%rbx	\n\t"/* __in0 + 2*istride */\
		"leaq	%c[__i3](%%rax),%%rdx	\n\t"/* __in0 + 3*istride */\
		"movq	%[__cc0],%%rsi 			\n\t"\
		"vbroadcastsd 0x28(%%rsi),%%ymm13 \n\t vbroadcastsd 0x38(%%rsi),%%ymm14 \n\t vbroadcastsd 0x48(%%rsi),%%ymm15 \n\t"/* load __r8,r4,rC into ymm13-15 */\
		"vmovaps		     (%%rcx),%%ymm4 		\n\t	vmovaps			0x020(%%rcx),%%ymm5 		\n\t"/*	t04 =__A8r;					t05 =__A8i; */\
		"vmovaps		     (%%rax),%%ymm0 		\n\t	vmovaps			0x020(%%rax),%%ymm1 		\n\t"/*	t00 =__A0r;					t01 =__A0i; */\
		"vmovaps		%%ymm4,%%ymm6				\n\t"/*	t06 = t04; */\
		"vfnmadd231pd	%%ymm5 ,%%ymm13,%%ymm4 		\n\t	 vfmadd231pd	%%ymm6 ,%%ymm13,%%ymm5 		\n\t"/*	FNMA231(  t05,__r8,t04);	 FMA231(  t06,__r8,t05); */\
		"vmovaps		     (%%rbx),%%ymm8			\n\t	vmovaps			0x020(%%rbx),%%ymm9 		\n\t"/*	_a =__A4r;					_b =__A4i; */\
		"vfnmadd231pd	0x020(%%rbx),%%ymm14,%%ymm8 \n\t	 vfmadd231pd	     (%%rbx),%%ymm14,%%ymm9 \n\t"/*	FNMA231(__A4i,__r4,_a );	 FMA231(__A4r,__r4,_b ); */\
		"vbroadcastsd	0x040(%%rsi),%%ymm13		\n\t	vbroadcastsd	0x020(%%rsi),%%ymm14		\n\t"/* load __cC4,c8 into pair of regs */\
		"vmovaps		     (%%rdx),%%ymm6			\n\t	vmovaps			0x020(%%rdx),%%ymm7 		\n\t"/*	t06 =__ACr;					t07 =__ACi; */\
		"vfnmadd231pd	0x020(%%rdx),%%ymm15,%%ymm6 \n\t	 vfmadd231pd	     (%%rdx),%%ymm15,%%ymm7 \n\t"/*	FNMA231(__ACi,__rC,t06);	 FMA231(__ACr,__rC,t07); */\
		"vbroadcastsd	0x030(%%rsi),%%ymm15		\n\t"/* load __c4 */\
		"vmovaps		%%ymm8 ,%%ymm10				\n\t	vmovaps			%%ymm0,%%ymm2 				\n\t"/*	_c = _a;	t02 = t00; */\
		" vfmadd231pd	%%ymm6 ,%%ymm13,%%ymm8 		\n\t	 vfmadd231pd	%%ymm4 ,%%ymm14,%%ymm0 		\n\t"/*	 FMA231(t06,__cC4,_a);		 FMA231(t04,__c8,t00); */\
		"vmovaps		%%ymm9 ,%%ymm11				\n\t	vmovaps			%%ymm1 ,%%ymm3 				\n\t"/*	_d = _b;	t03 = t01; */\
		" vfmadd231pd	%%ymm7 ,%%ymm13,%%ymm9 		\n\t	 vfmadd231pd	%%ymm5 ,%%ymm14,%%ymm1 		\n\t"/*	 FMA231(t07,__cC4,_b);		 FMA231(t05,__c8,t01); */\
		"vfnmadd231pd	%%ymm6 ,%%ymm13,%%ymm10		\n\t	vfnmadd231pd	%%ymm4 ,%%ymm14,%%ymm2 		\n\t"/*	FNMA231(t06,__cC4,_c);		FNMA231(t04,__c8,t02); */\
		"vfnmadd231pd	%%ymm7 ,%%ymm13,%%ymm11		\n\t	vfnmadd231pd	%%ymm5 ,%%ymm14,%%ymm3 		\n\t"/*	FNMA231(t07,__cC4,_d);		FNMA231(t05,__c8,t03); */\
		"vmovaps		%%ymm0 ,%%ymm4 				\n\t	vmovaps			%%ymm1 ,%%ymm5 				\n\t"/*	t04 =t00; t05 =t01; */\
		"vfnmadd231pd	%%ymm8 ,%%ymm15,%%ymm4 		\n\t	 vfmadd231pd	%%ymm8 ,%%ymm15,%%ymm0 		\n\t"/*	FNMA231(_a ,__c4 ,t04);		 FMA231(_a ,__c4 ,t00); */\
		"vfnmadd231pd	%%ymm9 ,%%ymm15,%%ymm5 		\n\t	 vfmadd231pd	%%ymm9 ,%%ymm15,%%ymm1 		\n\t"/*	FNMA231(_b ,__c4 ,t05);		 FMA231(_b ,__c4 ,t01); */\
		"vmovaps		%%ymm2 ,%%ymm6 				\n\t	vmovaps			%%ymm3 ,%%ymm7 				\n\t"/*	t06 =t02;	t07 =t03; */\
		" vfmadd231pd	%%ymm11,%%ymm15,%%ymm6 		\n\t	vfnmadd231pd	%%ymm11,%%ymm15,%%ymm2 		\n\t"/*	 FMA231(_d ,__c4 ,t06);		FNMA231(_d ,__c4 ,t02); */\
		"vfnmadd231pd	%%ymm10,%%ymm15,%%ymm7 		\n\t	 vfmadd231pd	%%ymm10,%%ymm15,%%ymm3 		\n\t"/*	FNMA231(_c ,__c4 ,t07);		 FMA231(_c ,__c4 ,t03); */\
		"vmovaps		%%ymm4 ,     (%%rcx)		\n\t	vmovaps			%%ymm0 ,     (%%rax)		\n\t"/* Write outputs into local store */\
		"vmovaps		%%ymm5 ,0x020(%%rcx)		\n\t	vmovaps			%%ymm1 ,0x020(%%rax)		\n\t"\
		"vmovaps		%%ymm6 ,     (%%rdx)		\n\t	vmovaps			%%ymm2 ,     (%%rbx)		\n\t"\
		"vmovaps		%%ymm7 ,0x020(%%rdx)		\n\t	vmovaps			%%ymm3 ,0x020(%%rbx)		\n\t"\
		"\n\t"\
	/*...Block 1: outputs into __in0 + [4,5,6,7]*istride: */\
		"addq	$%c[__i4],%%rax	\n\t"/* __in0 + 4*istride */\
		"addq	$%c[__i4],%%rcx	\n\t"/* __in0 + 5*istride */\
		"addq	$%c[__i4],%%rbx	\n\t"/* __in0 + 6*istride */\
		"addq	$%c[__i4],%%rdx	\n\t"/* __in0 + 7*istride */\
		"addq	$0x040,%%rsi 		/* cc += 8 */\n\t"\
		"vbroadcastsd 0x018(%%rsi),%%ymm12 	\n\t"/* load __r2 into ymm12 */\
		"vbroadcastsd 0x028(%%rsi),%%ymm13 	\n\t"/* load __rA into ymm13 */\
		"vbroadcastsd 0x038(%%rsi),%%ymm14 	\n\t"/* load __r6 into ymm14 */\
		"vbroadcastsd 0x048(%%rsi),%%ymm15 	\n\t"/* load __rE into ymm15 */\
		"vmovaps (%%rax),%%ymm0  \n\t vmovaps 0x020(%%rax),%%ymm1  \n\t"/* t08 =__A2r; t09 =__A2i; */\
		"vmovaps (%%rcx),%%ymm4  \n\t vmovaps 0x020(%%rcx),%%ymm5  \n\t"/* t12 =__AAr; t13 =__AAi; */\
		"vmovaps (%%rbx),%%ymm8  \n\t vmovaps 0x020(%%rbx),%%ymm9  \n\t"/* _a  =__A6r; _b  =__A6i; */\
		"vmovaps (%%rdx),%%ymm6  \n\t vmovaps 0x020(%%rdx),%%ymm7  \n\t"/* t14 =__AEr; t15 =__AEi; */\
		"vfnmadd231pd	0x020(%%rax),%%ymm12,%%ymm0 \n\t	 vfmadd231pd	(%%rax),%%ymm12,%%ymm1  	\n\t"/* FNMA231(__A2i,__r2,t08); FMA231(__A2r,__r2,t09); */\
		"vfnmadd231pd	0x020(%%rcx),%%ymm13,%%ymm4 \n\t	 vfmadd231pd	(%%rcx),%%ymm13,%%ymm5  	\n\t"/* FNMA231(__AAi,__rA,t12); FMA231(__AAr,__rA,t13); */\
		"vbroadcastsd	0x040(%%rsi),%%ymm13		\n\t"/* load __cE6 */\
		"vfnmadd231pd	0x020(%%rbx),%%ymm14,%%ymm8 \n\t	 vfmadd231pd	(%%rbx),%%ymm14,%%ymm9  	\n\t"/* FNMA231(__A6i,__r6,_a ); FMA231(__A6r,__r6,_b ); */\
		"vbroadcastsd	0x020(%%rsi),%%ymm14		\n\t"/* load __cA2 */\
		"vfnmadd231pd	0x020(%%rdx),%%ymm15,%%ymm6 \n\t	 vfmadd231pd	(%%rdx),%%ymm15,%%ymm7  	\n\t"/* FNMA231(__AEi,__rE,t14); FMA231(__AEr,__rE,t15); */\
		"vbroadcastsd	0x030(%%rsi),%%ymm15		\n\t"/* load __c62 */\
		"vmovaps		%%ymm8 ,%%ymm10				\n\t	vmovaps			%%ymm0,%%ymm2 				\n\t"/*	_c = _a;	t10 = t08; */\
		" vfmadd231pd	%%ymm6 ,%%ymm13,%%ymm8 		\n\t	 vfmadd231pd	%%ymm4 ,%%ymm14,%%ymm0 		\n\t"/*	 FMA231(t14,__cE6,_a);		 FMA231(t12,__cA2,t08); */\
		"vmovaps		%%ymm9 ,%%ymm11				\n\t	vmovaps			%%ymm1 ,%%ymm3 				\n\t"/*	_d = _b;	t11 = t09; */\
		" vfmadd231pd	%%ymm7 ,%%ymm13,%%ymm9 		\n\t	 vfmadd231pd	%%ymm5 ,%%ymm14,%%ymm1 		\n\t"/*	 FMA231(t15,__cE6,_b);		 FMA231(t13,__cA2,t09); */\
		"vfnmadd231pd	%%ymm6 ,%%ymm13,%%ymm10		\n\t	vfnmadd231pd	%%ymm4 ,%%ymm14,%%ymm2 		\n\t"/*	FNMA231(t14,__cE6,_c);		FNMA231(t12,__cA2,t10); */\
		"vfnmadd231pd	%%ymm7 ,%%ymm13,%%ymm11		\n\t	vfnmadd231pd	%%ymm5 ,%%ymm14,%%ymm3 		\n\t"/*	FNMA231(t15,__cE6,_d);		FNMA231(t13,__cA2,t11); */\
		"vmovaps		%%ymm0 ,%%ymm4 				\n\t	vmovaps			%%ymm1 ,%%ymm5 				\n\t"/*	t12 =t08 ;	t13 =t09; */\
		"vfnmadd231pd	%%ymm8 ,%%ymm15,%%ymm4 		\n\t	 vfmadd231pd	%%ymm8 ,%%ymm15,%%ymm0 		\n\t"/*	FNMA231(_a,__c62,t12);		 FMA231( _a,__c62,t08); */\
		"vfnmadd231pd	%%ymm9 ,%%ymm15,%%ymm5 		\n\t	 vfmadd231pd	%%ymm9 ,%%ymm15,%%ymm1 		\n\t"/*	FNMA231(_b,__c62,t13);		 FMA231( _b,__c62,t09); */\
		"vmovaps		%%ymm2 ,%%ymm6 				\n\t	vmovaps			%%ymm3 ,%%ymm7 				\n\t"/*	t14 =t10;	t15 =t11; */\
		" vfmadd231pd	%%ymm11,%%ymm15,%%ymm6 		\n\t	vfnmadd231pd	%%ymm11,%%ymm15,%%ymm2 		\n\t"/*	 FMA231(_d,__c62,t14);		FNMA231( _d,__c62,t10); */\
		"vfnmadd231pd	%%ymm10,%%ymm15,%%ymm7 		\n\t	 vfmadd231pd	%%ymm10,%%ymm15,%%ymm3 		\n\t"/*	FNMA231(_c,__c62,t15);		 FMA231( _c,__c62,t11); */\
		"vmovaps		%%ymm4 ,     (%%rcx)		\n\t	vmovaps			%%ymm0 ,     (%%rax)		\n\t"/* Write outputs into local store */\
		"vmovaps		%%ymm5 ,0x020(%%rcx)		\n\t	vmovaps			%%ymm1 ,0x020(%%rax)		\n\t"\
		"vmovaps		%%ymm6 ,     (%%rdx)		\n\t	vmovaps			%%ymm2 ,     (%%rbx)		\n\t"\
		"vmovaps		%%ymm7 ,0x020(%%rdx)		\n\t	vmovaps			%%ymm3 ,0x020(%%rbx)		\n\t"\
		"\n\t"\
	/*...Block 2: outputs into __in0 + [8,9,a,b]*istride: */\
		"addq	$%c[__i4],%%rax	\n\t"/* __in0 + 8*istride */\
		"addq	$%c[__i4],%%rcx	\n\t"/* __in0 + 9*istride */\
		"addq	$%c[__i4],%%rbx	\n\t"/* __in0 + a*istride */\
		"addq	$%c[__i4],%%rdx	\n\t"/* __in0 + b*istride */\
		"addq	$0x040,%%rsi 		/* cc += 8 */\n\t"\
		"vbroadcastsd 0x018(%%rsi),%%ymm12 	\n\t"/* load __r1 into ymm12 */\
		"vbroadcastsd 0x028(%%rsi),%%ymm13 	\n\t"/* load __r9 into ymm13 */\
		"vbroadcastsd 0x038(%%rsi),%%ymm14 	\n\t"/* load __r5 into ymm14 */\
		"vbroadcastsd 0x048(%%rsi),%%ymm15 	\n\t"/* load __rD into ymm15 */\
		"vmovaps (%%rax),%%ymm0  \n\t vmovaps 0x020(%%rax),%%ymm1  \n\t"/* t16 =__A1r;	t17 =__A1i; */\
		"vmovaps (%%rcx),%%ymm4  \n\t vmovaps 0x020(%%rcx),%%ymm5  \n\t"/* t20 =__A9r;	t21 =__A9i; */\
		"vmovaps (%%rbx),%%ymm8  \n\t vmovaps 0x020(%%rbx),%%ymm9  \n\t"/* _a=  __A5r;	_b  =__A5i; */\
		"vmovaps (%%rdx),%%ymm6  \n\t vmovaps 0x020(%%rdx),%%ymm7  \n\t"/* t22 =__ADr;	t23 =__ADi; */\
		"vfnmadd231pd	0x020(%%rax),%%ymm12,%%ymm0 \n\t	 vfmadd231pd	(%%rax),%%ymm12,%%ymm1  	\n\t"/* FNMA231(__A1i,__r1,t16);	 FMA231(__A1r,__r1,t17); */\
		"vfnmadd231pd	0x020(%%rcx),%%ymm13,%%ymm4 \n\t	 vfmadd231pd	(%%rcx),%%ymm13,%%ymm5  	\n\t"/* FNMA231(__A9i,__r9,t20);	 FMA231(__A9r,__r9,t21); */\
		"vbroadcastsd	0x040(%%rsi),%%ymm13		\n\t"/* load __cD5 */\
		"vfnmadd231pd	0x020(%%rbx),%%ymm14,%%ymm8 \n\t	 vfmadd231pd	(%%rbx),%%ymm14,%%ymm9  	\n\t"/* FNMA231(__A5i,__r5,_a );	 FMA231(__A5r,__r5,_b ); */\
		"vbroadcastsd	0x020(%%rsi),%%ymm14		\n\t"/* load __c91 */\
		"vfnmadd231pd	0x020(%%rdx),%%ymm15,%%ymm6 \n\t	 vfmadd231pd	(%%rdx),%%ymm15,%%ymm7  	\n\t"/* FNMA231(__ADi,__rD,t22);	 FMA231(__ADr,__rD,t23); */\
		"vbroadcastsd	0x030(%%rsi),%%ymm15		\n\t"/* load __c51 */\
		"vmovaps		%%ymm8 ,%%ymm10				\n\t	vmovaps			%%ymm0,%%ymm2 				\n\t"/*	_c= _a;	t18= t16; */\
		" vfmadd231pd	%%ymm6 ,%%ymm13,%%ymm8 		\n\t	 vfmadd231pd	%%ymm4 ,%%ymm14,%%ymm0 		\n\t"/*	 FMA231(t22,__cD5,_a);		 FMA231(t20,__c91,t16); */\
		"vmovaps		%%ymm9 ,%%ymm11				\n\t	vmovaps			%%ymm1 ,%%ymm3 				\n\t"/*	_d= _b;	t19= t17; */\
		" vfmadd231pd	%%ymm7 ,%%ymm13,%%ymm9 		\n\t	 vfmadd231pd	%%ymm5 ,%%ymm14,%%ymm1 		\n\t"/*	 FMA231(t23,__cD5,_b);		 FMA231(t21,__c91,t17); */\
		"vfnmadd231pd	%%ymm6 ,%%ymm13,%%ymm10		\n\t	vfnmadd231pd	%%ymm4 ,%%ymm14,%%ymm2 		\n\t"/*	FNMA231(t22,__cD5,_c);		FNMA231(t20,__c91,t18); */\
		"vfnmadd231pd	%%ymm7 ,%%ymm13,%%ymm11		\n\t	vfnmadd231pd	%%ymm5 ,%%ymm14,%%ymm3 		\n\t"/*	FNMA231(t23,__cD5,_d);		FNMA231(t21,__c91,t19); */\
		"vmovaps		%%ymm0 ,%%ymm4 				\n\t	vmovaps			%%ymm1 ,%%ymm5 				\n\t"/*	t20 =t16;	t21 =t17; */\
		"vfnmadd231pd	%%ymm8 ,%%ymm15,%%ymm4 		\n\t	 vfmadd231pd	%%ymm8 ,%%ymm15,%%ymm0 		\n\t"/*	FNMA231(_a,__c51,t20);		 FMA231(_a,__c51,t16); */\
		"vfnmadd231pd	%%ymm9 ,%%ymm15,%%ymm5 		\n\t	 vfmadd231pd	%%ymm9 ,%%ymm15,%%ymm1 		\n\t"/*	FNMA231(_b,__c51,t21);		 FMA231(_b,__c51,t17); */\
		"vmovaps		%%ymm2 ,%%ymm6 				\n\t	vmovaps			%%ymm3 ,%%ymm7 				\n\t"/*	t22 =t18;	t23 =t19; */\
		" vfmadd231pd	%%ymm11,%%ymm15,%%ymm6 		\n\t	vfnmadd231pd	%%ymm11,%%ymm15,%%ymm2 		\n\t"/*	 FMA231(_d,__c51,t22);		FNMA231(_d,__c51,t18); */\
		"vfnmadd231pd	%%ymm10,%%ymm15,%%ymm7 		\n\t	 vfmadd231pd	%%ymm10,%%ymm15,%%ymm3 		\n\t"/*	FNMA231(_c,__c51,t23);		 FMA231(_c,__c51,t19); */\
		"vmovaps		%%ymm4 ,     (%%rcx)		\n\t	vmovaps			%%ymm0 ,     (%%rax)		\n\t"/* Write outputs into local store */\
		"vmovaps		%%ymm5 ,0x020(%%rcx)		\n\t	vmovaps			%%ymm1 ,0x020(%%rax)		\n\t"\
		"vmovaps		%%ymm6 ,     (%%rdx)		\n\t	vmovaps			%%ymm2 ,     (%%rbx)		\n\t"\
		"vmovaps		%%ymm7 ,0x020(%%rdx)		\n\t	vmovaps			%%ymm3 ,0x020(%%rbx)		\n\t"\
		"\n\t"\
	/*...Block 3: outputs into __in0 + [c,d,e,f]*istride: */\
		"addq	$%c[__i4],%%rax	\n\t"/* __in0 + c*istride */\
		"addq	$%c[__i4],%%rcx	\n\t"/* __in0 + d*istride */\
		"addq	$%c[__i4],%%rbx	\n\t"/* __in0 + e*istride */\
		"addq	$%c[__i4],%%rdx	\n\t"/* __in0 + f*istride */\
		"addq	$0x040,%%rsi 		/* cc += 8 */\n\t"\
		"vbroadcastsd 0x018(%%rsi),%%ymm12 	\n\t"/* load __r3 into ymm12 */\
		"vbroadcastsd 0x028(%%rsi),%%ymm13 	\n\t"/* load __rB into ymm13 */\
		"vbroadcastsd 0x038(%%rsi),%%ymm14 	\n\t"/* load __r7 into ymm14 */\
		"vbroadcastsd 0x048(%%rsi),%%ymm15 	\n\t"/* load __rF into ymm15 */\
		"vmovaps (%%rax),%%ymm0  \n\t vmovaps 0x020(%%rax),%%ymm1  \n\t"/* t24 =__A3r;	t25 =__A3i; */\
		"vmovaps (%%rcx),%%ymm4  \n\t vmovaps 0x020(%%rcx),%%ymm5  \n\t"/* t28 =__ABr;	t29 =__ABi; */\
		"vmovaps (%%rbx),%%ymm8  \n\t vmovaps 0x020(%%rbx),%%ymm9  \n\t"/* _a  =__A7r;	_b  =__A7i; */\
		"vmovaps (%%rdx),%%ymm6  \n\t vmovaps 0x020(%%rdx),%%ymm7  \n\t"/* t30 =__AFr;	t31 =__AFi; */\
		"vfnmadd231pd	0x020(%%rax),%%ymm12,%%ymm0 \n\t	 vfmadd231pd	(%%rax),%%ymm12,%%ymm1  	\n\t"/* FNMA231(__A3i,__r3,t24);	 FMA231(__A3r,__r3,t25); */\
		"vfnmadd231pd	0x020(%%rcx),%%ymm13,%%ymm4 \n\t	 vfmadd231pd	(%%rcx),%%ymm13,%%ymm5  	\n\t"/* FNMA231(__ABi,__rB,t28 );	 FMA231(__ABr,__rB,t29 ); */\
		"vbroadcastsd	0x040(%%rsi),%%ymm13		\n\t"/* load __cF7 */\
		"vfnmadd231pd	0x020(%%rbx),%%ymm14,%%ymm8 \n\t	 vfmadd231pd	(%%rbx),%%ymm14,%%ymm9  	\n\t"/* FNMA231(__A7i,__r7,_a);		 FMA231(__A7r,__r7,_b); */\
		"vbroadcastsd	0x020(%%rsi),%%ymm14		\n\t"/* load __cB3 */\
		"vfnmadd231pd	0x020(%%rdx),%%ymm15,%%ymm6 \n\t	 vfmadd231pd	(%%rdx),%%ymm15,%%ymm7  	\n\t"/* FNMA231(__AFi,__rF,t30 );	 FMA231(__AFr,__rF,t31 ); */\
		"vbroadcastsd	0x030(%%rsi),%%ymm15		\n\t"/* load __c73 */\
		"vmovaps		%%ymm8 ,%%ymm10				\n\t	vmovaps			%%ymm0,%%ymm2 				\n\t"/*	_c= _a;	t26= t24; */\
		" vfmadd231pd	%%ymm6 ,%%ymm13,%%ymm8 		\n\t	 vfmadd231pd	%%ymm4 ,%%ymm14,%%ymm0 		\n\t"/*	 FMA231(t30,__cF7,_a);		 FMA231(t28,__cB3,t24); */\
		"vmovaps		%%ymm9 ,%%ymm11				\n\t	vmovaps			%%ymm1 ,%%ymm3 				\n\t"/*	_d= _b;	t27= t25; */\
		" vfmadd231pd	%%ymm7 ,%%ymm13,%%ymm9 		\n\t	 vfmadd231pd	%%ymm5 ,%%ymm14,%%ymm1 		\n\t"/*	 FMA231(t31,__cF7,_b);		 FMA231(t29,__cB3,t25); */\
		"vfnmadd231pd	%%ymm6 ,%%ymm13,%%ymm10		\n\t	vfnmadd231pd	%%ymm4 ,%%ymm14,%%ymm2 		\n\t"/*	FNMA231(t30,__cF7,_c);		FNMA231(t28,__cB3,t26); */\
		"vfnmadd231pd	%%ymm7 ,%%ymm13,%%ymm11		\n\t	vfnmadd231pd	%%ymm5 ,%%ymm14,%%ymm3 		\n\t"/*	FNMA231(t31,__cF7,_d);		FNMA231(t29,__cB3,t27); */\
		"vmovaps		%%ymm0 ,%%ymm4 				\n\t	vmovaps			%%ymm1 ,%%ymm5 				\n\t"/*	t28 =t24;	t29 =t25; */\
		"vfnmadd231pd	%%ymm8 ,%%ymm15,%%ymm4 		\n\t	 vfmadd231pd	%%ymm8 ,%%ymm15,%%ymm0 		\n\t"/*	FNMA231(_a,__c73,t28);		 FMA231(_a,__c73,t24); */\
		"vfnmadd231pd	%%ymm9 ,%%ymm15,%%ymm5 		\n\t	 vfmadd231pd	%%ymm9 ,%%ymm15,%%ymm1 		\n\t"/*	FNMA231(_b,__c73,t29);		 FMA231(_b,__c73,t25); */\
		"vmovaps		%%ymm2 ,%%ymm6 				\n\t	vmovaps			%%ymm3 ,%%ymm7 				\n\t"/*	t30 =t26;	t31 =t27; */\
		" vfmadd231pd	%%ymm11,%%ymm15,%%ymm6 		\n\t	vfnmadd231pd	%%ymm11,%%ymm15,%%ymm2 		\n\t"/*	 FMA231(_d,__c73,t30);		FNMA231(_d,__c73,t26); */\
		"vfnmadd231pd	%%ymm10,%%ymm15,%%ymm7 		\n\t	 vfmadd231pd	%%ymm10,%%ymm15,%%ymm3 		\n\t"/*	FNMA231(_c,__c73,t31);		 FMA231(_c,__c73,t27); */\
		"vmovaps		%%ymm4 ,     (%%rcx)		\n\t	vmovaps			%%ymm0 ,     (%%rax)		\n\t"/* Write outputs into local store */\
		"vmovaps		%%ymm5 ,0x020(%%rcx)		\n\t	vmovaps			%%ymm1 ,0x020(%%rax)		\n\t"\
		"vmovaps		%%ymm6 ,     (%%rdx)		\n\t	vmovaps			%%ymm2 ,     (%%rbx)		\n\t"\
		"vmovaps		%%ymm7 ,0x020(%%rdx)		\n\t	vmovaps			%%ymm3 ,0x020(%%rbx)		\n\t"\
		"\n\t"\
	/*************************************************************************************/\
	/*  And now do four more radix-4 transforms, including the internal twiddle factors: */\
	/*************************************************************************************/\
	/* Block 0: Combine 0-output of each radix-4, i.e. inputs from __in0 + [0,4,8,c]*istride: */\
		"movq	%[__in0],%%rax		\n\t"\
		"leaq	%c[__i4](%%rax),%%rbx	\n\t"/* __in0 +   [4*istride] */\
		"leaq	%c[__i4](%%rbx),%%rcx	\n\t"/* __in0 + 2*[4*istride] */\
		"leaq	%c[__i4](%%rcx),%%rdx	\n\t"/* __in0 + 3*[4*istride] */\
		"subq	$0x0c0,%%rsi 		/* revert cc-ptr to base value */\n\t"\
		"\n\t"\
		/*...Read t0,8,16,24 from local store ... Do the 4 Im-part FMAs first, because their results needed 1st below */\
		"vbroadcastsd	0x050(%%rsi),%%ymm14		\n\t	vbroadcastsd	0x0d0(%%rsi),%%ymm15		\n\t"/* load __c2,c31 into pair of regs */\
		"vmovaps		     (%%rax),%%ymm0 		\n\t	vmovaps			     (%%rbx),%%ymm2 		\n\t"/*    t00;    t08; */\
		"vmovaps		     (%%rcx),%%ymm4 		\n\t	vmovaps			     (%%rdx),%%ymm6 		\n\t"/*    t16;    t24; */\
		"vmovaps			 %%ymm0 ,%%ymm8 		\n\t	vmovaps				 %%ymm4 ,%%ymm10		\n\t"/* _a=t00; _c=t16; */\
		" vfmadd231pd	%%ymm2 ,%%ymm14,%%ymm0 		\n\t	 vfmadd231pd	%%ymm6 ,%%ymm15,%%ymm4 		\n\t"/*	 FMA231(t08,__c2 ,t00);		 FMA231(t24,__c31,t16); */\
		"vmovaps		0x020(%%rax),%%ymm1 		\n\t	vmovaps			0x020(%%rbx),%%ymm3 		\n\t"/*    t01;    t09; */\
		"vmovaps		0x020(%%rcx),%%ymm5 		\n\t	vmovaps			0x020(%%rdx),%%ymm7 		\n\t"/*    t17;    t25; */\
		"vmovaps			 %%ymm1 ,%%ymm9 		\n\t	vmovaps				 %%ymm5 ,%%ymm11		\n\t"/* _b=t01; _d=t17; */\
		" vfmadd231pd	%%ymm3 ,%%ymm14,%%ymm1 		\n\t	 vfmadd231pd	%%ymm7 ,%%ymm15,%%ymm5 		\n\t"/*	 FMA231(t09,__c2 ,t01);		 FMA231(t25,__c31,t17); */\
		"vfnmadd231pd	%%ymm2 ,%%ymm14,%%ymm8 		\n\t	vfnmadd231pd	%%ymm6 ,%%ymm15,%%ymm10		\n\t"/*	FNMA231(t08,__c2 ,_a );		FNMA231(t24,__c31,_c ); */\
		"vbroadcastsd	0x090(%%rsi),%%ymm6 		\n\t"/* load __c1 */\
		"vfnmadd231pd	%%ymm3 ,%%ymm14,%%ymm9 		\n\t	vfnmadd231pd	%%ymm7 ,%%ymm15,%%ymm11		\n\t"/*	FNMA231(t09,__c2 ,_b );		FNMA231(t25,__c31,_d ); */\
		"vmovaps			 %%ymm0 ,%%ymm12		\n\t	vmovaps				 %%ymm1 ,%%ymm13		\n\t"/* _e = t00; _f = t01; */\
		" vfmadd231pd	%%ymm4 ,%%ymm6 ,%%ymm0 		\n\t	 vfmadd231pd	%%ymm5 ,%%ymm6 ,%%ymm1 		\n\t"/*	 FMA231(t16,__c1 ,t00);		 FMA231(t17,__c1 ,t01); */\
		"vfnmadd231pd	%%ymm4 ,%%ymm6 ,%%ymm12		\n\t	vfnmadd231pd	%%ymm5 ,%%ymm6 ,%%ymm13		\n\t"/*	FNMA231(t16,__c1 ,_e );		FNMA231(t17,__c1 ,_f ); */\
		"vmovaps			 %%ymm8 ,%%ymm2 		\n\t	vmovaps				 %%ymm9 ,%%ymm3 		\n\t"/* t08 = _a ; t09 = _b; */\
		"vfnmadd231pd	%%ymm11,%%ymm6 ,%%ymm2 		\n\t	 vfmadd231pd	%%ymm10,%%ymm6 ,%%ymm3 		\n\t"/*	FNMA231(_d ,__c1 ,t08);		 FMA231(_c ,__c1 ,t09); */\
		" vfmadd231pd	%%ymm11,%%ymm6 ,%%ymm8 		\n\t	vfnmadd231pd	%%ymm10,%%ymm6 ,%%ymm9 		\n\t"/*	 FMA231(_d ,__c1 ,_a );		FNMA231(_c ,__c1 ,_b ); */\
		/* Write outputs: */\
		"movq	%[__out0],%%r10		\n\t"\
		"movq	%[__out1],%%r11		\n\t"\
		"movq	%[__out2],%%r12		\n\t"\
		"movq	%[__out3],%%r13		\n\t"\
		"vmovaps		%%ymm0 ,     (%%r10)		\n\t	vmovaps			%%ymm1 ,0x020(%%r10)		\n\t"/* __B0r= t00;		__B0i= t01; */\
		"vmovaps		%%ymm12,     (%%r11)		\n\t	vmovaps			%%ymm13,0x020(%%r11)		\n\t"/* __B1r= _e ;		__B1i= _f ; */\
		"vmovaps		%%ymm2 ,     (%%r12)		\n\t	vmovaps			%%ymm3 ,0x020(%%r12)		\n\t"/* __B2r= t08;		__B2i= t09; */\
		"vmovaps		%%ymm8 ,     (%%r13)		\n\t	vmovaps			%%ymm9 ,0x020(%%r13)		\n\t"/* __B3r= _a ;		__B3i= _b ; */\
		"\n\t"\
		/*...Block 2: t4,12,20,28 */\
		"vbroadcastsd	0x110(%%rsi),%%ymm13	\n\t"/* cc0 + 0x22 = __two; Actually holds 1.0 in AVX2 mode */\
		"addq	$%c[__i1],%%rax	\n\t"/* __in0 + 1*istride */\
		"addq	$%c[__i1],%%rbx	\n\t"/* __in0 + 5*istride */\
		"addq	$%c[__i1],%%rcx	\n\t"/* __in0 + 9*istride */\
		"addq	$%c[__i1],%%rdx	\n\t"/* __in0 + d*istride */\
		"vbroadcastsd	0x050(%%rsi),%%ymm14		\n\t	vbroadcastsd	0x0d0(%%rsi),%%ymm15		\n\t"/* load __c2,31 into pair of regs */\
		"vmovaps		     (%%rcx),%%ymm4 		\n\t	vmovaps			0x020(%%rcx),%%ymm5 		\n\t"/*    t20;    t21; */\
		"vmovaps		     (%%rdx),%%ymm6 		\n\t	vmovaps			0x020(%%rdx),%%ymm7 		\n\t"/*    t28;    t29; */\
		"vmovaps			 %%ymm4 ,%%ymm10 		\n\t	vmovaps				 %%ymm7 ,%%ymm11		\n\t"/* _c=t20; _d=t29; */\
		"vfnmadd231pd	%%ymm5 ,%%ymm13,%%ymm10		\n\t	 vfmadd231pd	%%ymm5 ,%%ymm13,%%ymm4 		\n\t"/*	FNMA231(t21,1.0,_c );		 FMA231(t21,1.0,t20); */\
		" vfmadd231pd	%%ymm6 ,%%ymm13,%%ymm11		\n\t	vfnmadd231pd	%%ymm6 ,%%ymm13,%%ymm7 		\n\t"/*	 FMA231(t28,1.0,_d );		FNMA231(t28,1.0,t29); */\
		"vbroadcastsd	0x010(%%rsi),%%ymm13		\n\t"/* load __c1i2 */\
		"vmovaps		     (%%rax),%%ymm0 		\n\t	vmovaps			0x020(%%rax),%%ymm1 		\n\t"/*    t04;    t05; */\
		"vmovaps		     (%%rbx),%%ymm2 		\n\t	vmovaps			0x020(%%rbx),%%ymm3 		\n\t"/*    t12;    t13; */\
		"vmovaps			 %%ymm0 ,%%ymm8 		\n\t	vmovaps				 %%ymm1 ,%%ymm9 		\n\t"/* _a = t04; _b = t05; */\
		"vmovaps			 %%ymm10,%%ymm5 		\n\t	vmovaps				 %%ymm4 ,%%ymm12		\n\t"/* t21 = _c; _e = t20; */\
		" vfmadd231pd	%%ymm3 ,%%ymm14,%%ymm8 		\n\t	 vfmadd231pd	%%ymm11,%%ymm15,%%ymm5 		\n\t"/*	 FMA231(t13,__c2 ,_a );		 FMA231(_d ,__c31,t21); */\
		"vfnmadd231pd	%%ymm2 ,%%ymm14,%%ymm9 		\n\t	 vfmadd231pd	%%ymm7 ,%%ymm15,%%ymm4 		\n\t"/*	FNMA231(t12,__c2 ,_b );		 FMA231(t29,__c31,t20); */\
		"vfnmadd231pd	%%ymm3 ,%%ymm14,%%ymm0 		\n\t	vfnmadd231pd	%%ymm11,%%ymm15,%%ymm10		\n\t"/*	FNMA231(t13,__c2 ,t04);		FNMA231(_d ,__c31,_c ); */\
		" vfmadd231pd	%%ymm2 ,%%ymm14,%%ymm1 		\n\t	vfnmadd231pd	%%ymm7 ,%%ymm15,%%ymm12		\n\t"/*	 FMA231(t12,__c2 ,t05);		FNMA231(t29,__c31,_e ); */\
		"vmovaps			 %%ymm8 ,%%ymm2 		\n\t	vmovaps				 %%ymm9 ,%%ymm3 		\n\t"/* t12 = _a; t13 = _b; */\
		"vfnmadd231pd	%%ymm4 ,%%ymm13,%%ymm2 		\n\t	 vfmadd231pd	%%ymm5 ,%%ymm13,%%ymm3 		\n\t"/*	FNMA231(t20,__c1i2,t12);	 FMA231(t21,__c1i2,t13); */\
		" vfmadd231pd	%%ymm4 ,%%ymm13,%%ymm8 		\n\t	vfnmadd231pd	%%ymm5 ,%%ymm13,%%ymm9 		\n\t"/*	 FMA231(t20,__c1i2,_a );	FNMA231(t21,__c1i2,_b ); */\
		"vmovaps			 %%ymm0 ,%%ymm11		\n\t	vmovaps				 %%ymm1 ,%%ymm7 		\n\t"/* _d = t04; t29 = t05; */\
		" vfmadd231pd	%%ymm10,%%ymm13,%%ymm0 		\n\t	 vfmadd231pd	%%ymm12,%%ymm13,%%ymm1 		\n\t"/*	 FMA231(_c ,__c1i2,t04);	 FMA231(_e ,__c1i2,t05); */\
		"vfnmadd231pd	%%ymm10,%%ymm13,%%ymm11		\n\t	vfnmadd231pd	%%ymm12,%%ymm13,%%ymm7 		\n\t"/*	FNMA231(_c ,__c1i2,_d );	FNMA231(_e ,__c1i2,t29); */\
		/* Write outputs: */\
		"movq	%[__out4],%%r10		\n\t"\
		"movq	%[__out5],%%r11		\n\t"\
		"movq	%[__out6],%%r12		\n\t"\
		"movq	%[__out7],%%r13		\n\t"\
		"vmovaps		%%ymm2 ,     (%%r12)		\n\t	vmovaps			%%ymm3 ,0x020(%%r12)		\n\t"/* __B6r= t12;		__B6i= t13; */\
		"vmovaps		%%ymm8 ,     (%%r13)		\n\t	vmovaps			%%ymm9 ,0x020(%%r13)		\n\t"/* __B7r= _a ;		__B7i= _b ; */\
		"vmovaps		%%ymm0 ,     (%%r10)		\n\t	vmovaps			%%ymm1 ,0x020(%%r10)		\n\t"/* __B4r= t04;		__B4i= t05; */\
		"vmovaps		%%ymm11,     (%%r11)		\n\t	vmovaps			%%ymm7 ,0x020(%%r11)		\n\t"/* __B5r= _d ;		__B5i= t29; */\
		"\n\t"\
		/*...Block 1: t2,10,18,26 */\
		"addq	$%c[__i1],%%rax	\n\t"/* __in0 + 2*istride */\
		"addq	$%c[__i1],%%rbx	\n\t"/* __in0 + 6*istride */\
		"addq	$%c[__i1],%%rcx	\n\t"/* __in0 + a*istride */\
		"addq	$%c[__i1],%%rdx	\n\t"/* __in0 + e*istride */\
		"vmovaps		     (%%rbx),%%ymm2 		\n\t	vmovaps			0x020(%%rbx),%%ymm3 		\n\t"/*    t10;    t11; */\
		"vbroadcastsd	0x008(%%rsi),%%ymm15		\n\t"/* load __sc  */\
		"vbroadcastsd	0x0d0(%%rsi),%%ymm14		\n\t"/* load __c31 */\
		"vmovaps		     (%%rcx),%%ymm4 		\n\t	vmovaps			0x020(%%rcx),%%ymm5 		\n\t"/*    t18;    t19; */\
		"vmovaps		     (%%rdx),%%ymm6 		\n\t	vmovaps			0x020(%%rdx),%%ymm7 		\n\t"/*    t26;    t27; */\
		"vsubpd		%%ymm3 ,%%ymm2 ,%%ymm12 		\n\t"/* _e = t10-t11; */\
		"vaddpd		%%ymm2 ,%%ymm3 ,%%ymm13			\n\t"/* _f = t10+t11; */\
		"vmovaps			 %%ymm4 ,%%ymm10 		\n\t	vmovaps				 %%ymm7 ,%%ymm8 		\n\t"/* _c = t18; _a = t27; */\
		"vfnmadd231pd	%%ymm5 ,%%ymm15,%%ymm10		\n\t	 vfmsub231pd	%%ymm6 ,%%ymm15,%%ymm8 		\n\t"/*	FNMA231(t19,__sc,_c );		 FMS231(t26,__sc,_a ); */\
		"vmovaps			 %%ymm5 ,%%ymm11 		\n\t	vmovaps				 %%ymm6 ,%%ymm9 		\n\t"/* _d = t19; _b = t26; */\
		"vbroadcastsd	0x018(%%rsi),%%ymm6 		\n\t"/* load __c2i2 */\
		" vfmadd231pd	%%ymm4 ,%%ymm15,%%ymm11		\n\t	 vfmadd231pd	%%ymm7 ,%%ymm15,%%ymm9 		\n\t"/*	 FMA231(t18,__sc,_d );		 FMA231(t27,__sc,_b ); */\
		"vbroadcastsd	(%%rsi),%%ymm15				\n\t"/* load __c1_c */\
		"vmovaps		     (%%rax),%%ymm0 		\n\t	vmovaps			0x020(%%rax),%%ymm1 		\n\t"/*    t02;    t03; */\
		"vmovaps			 %%ymm10,%%ymm4 		\n\t	vmovaps				 %%ymm0 ,%%ymm2 		\n\t"/* t18 = _c;	t10 = t02; */\
		" vfmadd231pd	%%ymm8 ,%%ymm14,%%ymm4 		\n\t	 vfmadd231pd	%%ymm12,%%ymm6 ,%%ymm0 		\n\t"/*	 FMA231(_a ,__c31,t18);		 FMA231(_e ,__c2i2,t02); */\
		"vmovaps			 %%ymm11,%%ymm5 		\n\t	vmovaps				 %%ymm1 ,%%ymm3 		\n\t"/* t19 = _d;	t11 = t03; */\
		" vfmadd231pd	%%ymm9 ,%%ymm14,%%ymm5 		\n\t	 vfmadd231pd	%%ymm13,%%ymm6 ,%%ymm1 		\n\t"/*	 FMA231(_b ,__c31,t19);		 FMA231(_f ,__c2i2,t03); */\
		"vfnmadd231pd	%%ymm8 ,%%ymm14,%%ymm10		\n\t	vfnmadd231pd	%%ymm12,%%ymm6 ,%%ymm2 		\n\t"/*	FNMA231(_a ,__c31,_c );		FNMA231(_e ,__c2i2,t10); */\
		"vfnmadd231pd	%%ymm9 ,%%ymm14,%%ymm11		\n\t	vfnmadd231pd	%%ymm13,%%ymm6 ,%%ymm3 		\n\t"/*	FNMA231(_b ,__c31,_d );		FNMA231(_f ,__c2i2,t11); */\
		"vmovaps			 %%ymm0 ,%%ymm8 		\n\t	vmovaps				 %%ymm1 ,%%ymm9 		\n\t"/* _a = t02; _b = t03; */\
		" vfmadd231pd	%%ymm4 ,%%ymm15,%%ymm0 		\n\t	 vfmadd231pd	%%ymm5 ,%%ymm15,%%ymm1 		\n\t"/*	 FMA231(t18,__c1_c,t02);	 FMA231(t19,__c1_c,t03); */\
		"vfnmadd231pd	%%ymm4 ,%%ymm15,%%ymm8 		\n\t	vfnmadd231pd	%%ymm5 ,%%ymm15,%%ymm9 		\n\t"/*	FNMA231(t18,__c1_c,_a );	FNMA231(t19,__c1_c,_b ); */\
		"vmovaps			 %%ymm2 ,%%ymm12		\n\t	vmovaps				 %%ymm3 ,%%ymm13		\n\t"/* _e = t10; _f = t11; */\
		"vfnmadd231pd	%%ymm11,%%ymm15,%%ymm2 		\n\t	 vfmadd231pd	%%ymm10,%%ymm15,%%ymm3 		\n\t"/*	FNMA231(_d ,__c1_c,t10);	 FMA231(_c ,__c1_c,t11); */\
		" vfmadd231pd	%%ymm11,%%ymm15,%%ymm12		\n\t	vfnmadd231pd	%%ymm10,%%ymm15,%%ymm13		\n\t"/*	 FMA231(_d ,__c1_c,_e );	FNMA231(_c ,__c1_c,_f ); */\
		/* Write outputs: */\
		"movq	%[__out8],%%r10		\n\t"\
		"movq	%[__out9],%%r11		\n\t"\
		"movq	%[__outa],%%r12		\n\t"\
		"movq	%[__outb],%%r13		\n\t"\
		"vmovaps		%%ymm0 ,     (%%r10)		\n\t	vmovaps			%%ymm1 ,0x020(%%r10)		\n\t"/* __B8r= t02;		__B8i= t03; */\
		"vmovaps		%%ymm8 ,     (%%r11)		\n\t	vmovaps			%%ymm9 ,0x020(%%r11)		\n\t"/* __B9r= _a ;		__B9i= _b ; */\
		"vmovaps		%%ymm2 ,     (%%r12)		\n\t	vmovaps			%%ymm3 ,0x020(%%r12)		\n\t"/* __BAr= t10;		__BAi= t11; */\
		"vmovaps		%%ymm12,     (%%r13)		\n\t	vmovaps			%%ymm13,0x020(%%r13)		\n\t"/* __BBr= _e ;		__BBi= _f ; */\
		"\n\t"\
		/*...Block 3: t6,14,22,30 */\
		"addq	$%c[__i1],%%rax	\n\t"/* __in0 + 3*istride */\
		"addq	$%c[__i1],%%rbx	\n\t"/* __in0 + 7*istride */\
		"addq	$%c[__i1],%%rcx	\n\t"/* __in0 + b*istride */\
		"addq	$%c[__i1],%%rdx	\n\t"/* __in0 + f*istride */\
		"vmovaps		     (%%rbx),%%ymm2 		\n\t	vmovaps			0x020(%%rbx),%%ymm3 		\n\t"/*    t14;    t15; */\
		"vbroadcastsd	0x008(%%rsi),%%ymm15		\n\t"/* load __sc  */\
		"vmovaps		     (%%rcx),%%ymm4 		\n\t	vmovaps			0x020(%%rcx),%%ymm5 		\n\t"/*    t22;    t23; */\
		"vmovaps		     (%%rdx),%%ymm6 		\n\t	vmovaps			0x020(%%rdx),%%ymm7 		\n\t"/*    t30;    t31; */\
		"vaddpd		%%ymm2 ,%%ymm3 ,%%ymm10 		\n\t"/* _c = t14+t15; */\
		"vsubpd		%%ymm2 ,%%ymm3 ,%%ymm11			\n\t"/* _d = t15-t14; */\
		"vbroadcastsd	0x0d0(%%rsi),%%ymm14		\n\t"/* load __c31 */\
		"vmovaps			 %%ymm5 ,%%ymm12 		\n\t	vmovaps				 %%ymm4 ,%%ymm13		\n\t"/* _e = t23; _f = t22;*/\
		" vfmsub231pd	%%ymm4 ,%%ymm15,%%ymm12		\n\t	 vfmadd231pd	%%ymm5 ,%%ymm15,%%ymm13		\n\t"/*	 FMS231(t22,__sc,_e );		 FMA231(t23,__sc,_f );*/\
		"vmovaps			 %%ymm6 ,%%ymm8  		\n\t	vmovaps				 %%ymm7 ,%%ymm9 		\n\t"/* _a = t30; _b = t31; */\
		"vfnmadd231pd	%%ymm7 ,%%ymm15,%%ymm8 		\n\t	 vfmadd231pd	%%ymm6 ,%%ymm15,%%ymm9 		\n\t"/*	FNMA231(t31,__sc,_a );		 FMA231(t30,__sc,_b );*/\
		"vbroadcastsd	0x018(%%rsi),%%ymm6 		\n\t"/* load __c2i2 */\
		"vbroadcastsd	(%%rsi),%%ymm15				\n\t"/* load __c1_c */\
		"vmovaps		     (%%rax),%%ymm0 		\n\t	vmovaps			0x020(%%rax),%%ymm1 		\n\t"/*    t06;    t07; */\
		"vmovaps			 %%ymm1 ,%%ymm3 		\n\t	vmovaps				 %%ymm0 ,%%ymm2 		\n\t"/* t15= t07;	t14= t06; */\
		"vfnmadd231pd	%%ymm11,%%ymm6 ,%%ymm1 		\n\t	vfnmadd231pd	%%ymm10,%%ymm6 ,%%ymm0 		\n\t"/*	FNMA231(_d ,__c2i2,t07);	FNMA231(_c ,__c2i2,t06); */\
		"vmovaps			 %%ymm12,%%ymm4 		\n\t	vmovaps				 %%ymm13,%%ymm5 		\n\t"/* t22= _e; t23= _f; */\
		"vfnmadd231pd	%%ymm8 ,%%ymm14,%%ymm4 		\n\t	vfnmadd231pd	%%ymm9 ,%%ymm14,%%ymm5 		\n\t"/*	FNMA231(_a ,__c31 ,t22);	FNMA231(_b ,__c31 ,t23); */\
		" vfmadd231pd	%%ymm8 ,%%ymm14,%%ymm12		\n\t	 vfmadd231pd	%%ymm9 ,%%ymm14,%%ymm13		\n\t"/*	 FMA231(_a ,__c31 ,_e );	 FMA231(_b ,__c31 ,_f ); */\
		" vfmadd231pd	%%ymm10,%%ymm6 ,%%ymm2 		\n\t	 vfmadd231pd	%%ymm11,%%ymm6 ,%%ymm3 		\n\t"/*	 FMA231(_c ,__c2i2,t14);	 FMA231(_d ,__c2i2,t15); */\
		"vmovaps			 %%ymm0 ,%%ymm8 		\n\t	vmovaps				 %%ymm1 ,%%ymm9 		\n\t"/* _a = t06; _b = t07; */\
		" vfmadd231pd	%%ymm4 ,%%ymm15,%%ymm0 		\n\t	 vfmadd231pd	%%ymm5 ,%%ymm15,%%ymm1 		\n\t"/*	 FMA231(t22,__c1_c,t06);	 FMA231(t23,__c1_c,t07); */\
		"vfnmadd231pd	%%ymm4 ,%%ymm15,%%ymm8 		\n\t	vfnmadd231pd	%%ymm5 ,%%ymm15,%%ymm9 		\n\t"/*	FNMA231(t22,__c1_c,_a );	FNMA231(t23,__c1_c,_b ); */\
		"vmovaps			 %%ymm2 ,%%ymm10		\n\t	vmovaps				 %%ymm3 ,%%ymm11		\n\t"/* _c = t14; _d = t15; */\
		"vfnmadd231pd	%%ymm13,%%ymm15,%%ymm2 		\n\t	 vfmadd231pd	%%ymm12,%%ymm15,%%ymm3 		\n\t"/*	FNMA231(_f ,__c1_c,t14);	 FMA231(_e ,__c1_c,t15); */\
		" vfmadd231pd	%%ymm13,%%ymm15,%%ymm10		\n\t	vfnmadd231pd	%%ymm12,%%ymm15,%%ymm11		\n\t"/*	 FMA231(_f ,__c1_c,_c );	FNMA231(_e ,__c1_c,_d ); */\
		/* Write outputs: */\
		"movq	%[__outc],%%r10		\n\t"\
		"movq	%[__outd],%%r11		\n\t"\
		"movq	%[__oute],%%r12		\n\t"\
		"movq	%[__outf],%%r13		\n\t"\
		"vmovaps		%%ymm0 ,     (%%r10)		\n\t	vmovaps			%%ymm1 ,0x020(%%r10)		\n\t"/* __BCr= t06;		__BCi= t07; */\
		"vmovaps		%%ymm8 ,     (%%r11)		\n\t	vmovaps			%%ymm9 ,0x020(%%r11)		\n\t"/* __BDr= _a ;		__BDi= _b ; */\
		"vmovaps		%%ymm2 ,     (%%r12)		\n\t	vmovaps			%%ymm3 ,0x020(%%r12)		\n\t"/* __BEr= t14;		__BEi= t15; */\
		"vmovaps		%%ymm10,     (%%r13)		\n\t	vmovaps			%%ymm11,0x020(%%r13)		\n\t"/* __BFr= _c ;		__BFi= _d ; */\
		:					/* outputs: none */\
		: [__in0] "m" (Xin0)	/* All inputs from memory addresses here */\
		 ,[__i1] "e" (Xi1)\
		 ,[__i2] "e" (Xi2)\
		 ,[__i3] "e" (Xi3)\
		 ,[__i4] "e" (Xi4)\
		 ,[__out0] "m" (Xout0)\
		 ,[__out1] "m" (Xout1)\
		 ,[__out2] "m" (Xout2)\
		 ,[__out3] "m" (Xout3)\
		 ,[__out4] "m" (Xout4)\
		 ,[__out5] "m" (Xout5)\
		 ,[__out6] "m" (Xout6)\
		 ,[__out7] "m" (Xout7)\
		 ,[__out8] "m" (Xout8)\
		 ,[__out9] "m" (Xout9)\
		 ,[__outa] "m" (Xouta)\
		 ,[__outb] "m" (Xoutb)\
		 ,[__outc] "m" (Xoutc)\
		 ,[__outd] "m" (Xoutd)\
		 ,[__oute] "m" (Xoute)\
		 ,[__outf] "m" (Xoutf)\
		 ,[__cc0] "m" (Xcc0)\
		: "cc","memory","rax","rbx","rcx","rdx","rsi","r10","r11","r12","r13","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15"	/* Clobbered registers */\
	);\
	}

	#define SSE2_RADIX16_DIT_FMA_OOP(Xin0,Xi1,Xi2,Xi3,Xi4, Xout0,Xo1,Xo2,Xo3,Xo4, Xcc0)\
	{\
	__asm__ volatile (\
	/*...Block 0: Do in-place, i.e. outputs into __in0 + [0,1,2,3]*istride: */\
		"movq	%[__in0],%%rax		\n\t"\
		"leaq	%c[__i1](%%rax),%%rcx	\n\t"/* __in0 +   istride */\
		"leaq	%c[__i2](%%rax),%%rbx	\n\t"/* __in0 + 2*istride */\
		"leaq	%c[__i3](%%rax),%%rdx	\n\t"/* __in0 + 3*istride */\
		"movq	%[__cc0],%%rsi 			\n\t"\
		"vbroadcastsd 0x28(%%rsi),%%ymm13 \n\t vbroadcastsd 0x38(%%rsi),%%ymm14 \n\t vbroadcastsd 0x48(%%rsi),%%ymm15 \n\t"/* load __r8,r4,rC into ymm13-15 */\
		"vmovaps		     (%%rcx),%%ymm4 		\n\t	vmovaps			0x020(%%rcx),%%ymm5 		\n\t"/*	t04 =__A8r;					t05 =__A8i; */\
		"vmovaps		     (%%rax),%%ymm0 		\n\t	vmovaps			0x020(%%rax),%%ymm1 		\n\t"/*	t00 =__A0r;					t01 =__A0i; */\
		"vmovaps		%%ymm4,%%ymm6				\n\t"/*	t06 = t04; */\
		"vfmadd231pd	%%ymm5 ,%%ymm13,%%ymm4 		\n\t	vfnmadd231pd	%%ymm6 ,%%ymm13,%%ymm5 		\n\t"/*	 FMA231(  t05,__r8,t04);	FNMA231(  t06,__r8,t05); */\
		"vmovaps		     (%%rbx),%%ymm8			\n\t	vmovaps			0x020(%%rbx),%%ymm9 		\n\t"/*	_a =__A4r;					_b =__A4i; */\
		"vfmadd231pd	0x020(%%rbx),%%ymm14,%%ymm8 \n\t	vfnmadd231pd	     (%%rbx),%%ymm14,%%ymm9 \n\t"/*	 FMA231(__A4i,__r4,_a );	FNMA231(__A4r,__r4,_b ); */\
		"vbroadcastsd	0x040(%%rsi),%%ymm13		\n\t	vbroadcastsd	0x020(%%rsi),%%ymm14		\n\t"/* load __cC4,c8 into pair of regs */\
		"vmovaps		     (%%rdx),%%ymm6			\n\t	vmovaps			0x020(%%rdx),%%ymm7 		\n\t"/*	t06 =__ACr;					t07 =__ACi; */\
		"vfmadd231pd	0x020(%%rdx),%%ymm15,%%ymm6 \n\t	vfnmadd231pd	     (%%rdx),%%ymm15,%%ymm7 \n\t"/*	 FMA231(__ACi,__rC,t06);	FNMA231(__ACr,__rC,t07); */\
		"vbroadcastsd	0x030(%%rsi),%%ymm15		\n\t"/* load __c4 */\
		"vmovaps		%%ymm8 ,%%ymm10				\n\t	vmovaps			%%ymm0,%%ymm2 				\n\t"/*	_c = _a;	t02 = t00; */\
		" vfmadd231pd	%%ymm6 ,%%ymm13,%%ymm8 		\n\t	 vfmadd231pd	%%ymm4 ,%%ymm14,%%ymm0 		\n\t"/*	 FMA231(t06,__cC4,_a);		 FMA231(t04,__c8,t00); */\
		"vmovaps		%%ymm9 ,%%ymm11				\n\t	vmovaps			%%ymm1 ,%%ymm3 				\n\t"/*	_d = _b;	t03 = t01; */\
		" vfmadd231pd	%%ymm7 ,%%ymm13,%%ymm9 		\n\t	 vfmadd231pd	%%ymm5 ,%%ymm14,%%ymm1 		\n\t"/*	 FMA231(t07,__cC4,_b);		 FMA231(t05,__c8,t01); */\
		"vfnmadd231pd	%%ymm6 ,%%ymm13,%%ymm10		\n\t	vfnmadd231pd	%%ymm4 ,%%ymm14,%%ymm2 		\n\t"/*	FNMA231(t06,__cC4,_c);		FNMA231(t04,__c8,t02); */\
		"vfnmadd231pd	%%ymm7 ,%%ymm13,%%ymm11		\n\t	vfnmadd231pd	%%ymm5 ,%%ymm14,%%ymm3 		\n\t"/*	FNMA231(t07,__cC4,_d);		FNMA231(t05,__c8,t03); */\
		"vmovaps		%%ymm0 ,%%ymm4 				\n\t	vmovaps			%%ymm1 ,%%ymm5 				\n\t"/*	t04 =t00; t05 =t01; */\
		"vfnmadd231pd	%%ymm8 ,%%ymm15,%%ymm4 		\n\t	 vfmadd231pd	%%ymm8 ,%%ymm15,%%ymm0 		\n\t"/*	FNMA231(_a ,__c4 ,t04);		 FMA231(_a ,__c4 ,t00); */\
		"vfnmadd231pd	%%ymm9 ,%%ymm15,%%ymm5 		\n\t	 vfmadd231pd	%%ymm9 ,%%ymm15,%%ymm1 		\n\t"/*	FNMA231(_b ,__c4 ,t05);		 FMA231(_b ,__c4 ,t01); */\
		"vmovaps		%%ymm2 ,%%ymm6 				\n\t	vmovaps			%%ymm3 ,%%ymm7 				\n\t"/*	t06 =t02;	t07 =t03; */\
		" vfmadd231pd	%%ymm11,%%ymm15,%%ymm2 		\n\t	vfnmadd231pd	%%ymm11,%%ymm15,%%ymm6 		\n\t"/*	 FMA231(_d ,__c4 ,t02);		FNMA231(_d ,__c4 ,t06); */\
		"vfnmadd231pd	%%ymm10,%%ymm15,%%ymm3 		\n\t	 vfmadd231pd	%%ymm10,%%ymm15,%%ymm7 		\n\t"/*	FNMA231(_c ,__c4 ,t03);		 FMA231(_c ,__c4 ,t07); */\
		"vmovaps		%%ymm4 ,     (%%rcx)		\n\t	vmovaps			%%ymm0 ,     (%%rax)		\n\t"/* Write outputs into local store */\
		"vmovaps		%%ymm5 ,0x020(%%rcx)		\n\t	vmovaps			%%ymm1 ,0x020(%%rax)		\n\t"\
		"vmovaps		%%ymm6 ,     (%%rdx)		\n\t	vmovaps			%%ymm2 ,     (%%rbx)		\n\t"\
		"vmovaps		%%ymm7 ,0x020(%%rdx)		\n\t	vmovaps			%%ymm3 ,0x020(%%rbx)		\n\t"\
		"\n\t"\
	/*...Block 1: outputs into __in0 + [4,5,6,7]*istride: */\
		"addq	$%c[__i4],%%rax	\n\t"/* __in0 + 4*istride */\
		"addq	$%c[__i4],%%rcx	\n\t"/* __in0 + 5*istride */\
		"addq	$%c[__i4],%%rbx	\n\t"/* __in0 + 6*istride */\
		"addq	$%c[__i4],%%rdx	\n\t"/* __in0 + 7*istride */\
		"addq	$0x040,%%rsi 		/* cc += 8 */\n\t"\
		"vbroadcastsd 0x018(%%rsi),%%ymm12 	\n\t"/* load __r2 into ymm12 */\
		"vbroadcastsd 0x028(%%rsi),%%ymm13 	\n\t"/* load __rA into ymm13 */\
		"vbroadcastsd 0x038(%%rsi),%%ymm14 	\n\t"/* load __r6 into ymm14 */\
		"vbroadcastsd 0x048(%%rsi),%%ymm15 	\n\t"/* load __rE into ymm15 */\
		"vmovaps (%%rax),%%ymm0  \n\t vmovaps 0x020(%%rax),%%ymm1  \n\t"/* t08 =__A2r; t09 =__A2i; */\
		"vmovaps (%%rcx),%%ymm4  \n\t vmovaps 0x020(%%rcx),%%ymm5  \n\t"/* t12 =__AAr; t13 =__AAi; */\
		"vmovaps (%%rbx),%%ymm8  \n\t vmovaps 0x020(%%rbx),%%ymm9  \n\t"/* _a  =__A6r; _b  =__A6i; */\
		"vmovaps (%%rdx),%%ymm6  \n\t vmovaps 0x020(%%rdx),%%ymm7  \n\t"/* t14 =__AEr; t15 =__AEi; */\
		"vfmadd231pd	0x020(%%rax),%%ymm12,%%ymm0 \n\t	vfnmadd231pd	(%%rax),%%ymm12,%%ymm1  	\n\t"/* FMA231(__A2i,__r2,t08); FNMA231(__A2r,__r2,t09); */\
		"vfmadd231pd	0x020(%%rcx),%%ymm13,%%ymm4 \n\t	vfnmadd231pd	(%%rcx),%%ymm13,%%ymm5  	\n\t"/* FMA231(__AAi,__rA,t12); FNMA231(__AAr,__rA,t13); */\
		"vbroadcastsd	0x040(%%rsi),%%ymm13		\n\t"/* load __cE6 */\
		"vfmadd231pd	0x020(%%rbx),%%ymm14,%%ymm8 \n\t	vfnmadd231pd	(%%rbx),%%ymm14,%%ymm9  	\n\t"/* FMA231(__A6i,__r6,_a ); FNMA231(__A6r,__r6,_b ); */\
		"vbroadcastsd	0x020(%%rsi),%%ymm14		\n\t"/* load __cA2 */\
		"vfmadd231pd	0x020(%%rdx),%%ymm15,%%ymm6 \n\t	vfnmadd231pd	(%%rdx),%%ymm15,%%ymm7  	\n\t"/* FMA231(__AEi,__rE,t14); FNMA231(__AEr,__rE,t15); */\
		"vbroadcastsd	0x030(%%rsi),%%ymm15		\n\t"/* load __c62 */\
		"vmovaps		%%ymm8 ,%%ymm10				\n\t	vmovaps			%%ymm0,%%ymm2 				\n\t"/*	_c = _a;	t10 = t08; */\
		" vfmadd231pd	%%ymm6 ,%%ymm13,%%ymm8 		\n\t	 vfmadd231pd	%%ymm4 ,%%ymm14,%%ymm0 		\n\t"/*	 FMA231(t14,__cE6,_a);		 FMA231(t12,__cA2,t08); */\
		"vmovaps		%%ymm9 ,%%ymm11				\n\t	vmovaps			%%ymm1 ,%%ymm3 				\n\t"/*	_d = _b;	t11 = t09; */\
		" vfmadd231pd	%%ymm7 ,%%ymm13,%%ymm9 		\n\t	 vfmadd231pd	%%ymm5 ,%%ymm14,%%ymm1 		\n\t"/*	 FMA231(t15,__cE6,_b);		 FMA231(t13,__cA2,t09); */\
		"vfnmadd231pd	%%ymm6 ,%%ymm13,%%ymm10		\n\t	vfnmadd231pd	%%ymm4 ,%%ymm14,%%ymm2 		\n\t"/*	FNMA231(t14,__cE6,_c);		FNMA231(t12,__cA2,t10); */\
		"vfnmadd231pd	%%ymm7 ,%%ymm13,%%ymm11		\n\t	vfnmadd231pd	%%ymm5 ,%%ymm14,%%ymm3 		\n\t"/*	FNMA231(t15,__cE6,_d);		FNMA231(t13,__cA2,t11); */\
		"vmovaps		%%ymm0 ,%%ymm4 				\n\t	vmovaps			%%ymm1 ,%%ymm5 				\n\t"/*	t12 =t08 ;	t13 =t09; */\
		"vfnmadd231pd	%%ymm8 ,%%ymm15,%%ymm4 		\n\t	 vfmadd231pd	%%ymm8 ,%%ymm15,%%ymm0 		\n\t"/*	FNMA231(_a,__c62,t12);		 FMA231( _a,__c62,t08); */\
		"vfnmadd231pd	%%ymm9 ,%%ymm15,%%ymm5 		\n\t	 vfmadd231pd	%%ymm9 ,%%ymm15,%%ymm1 		\n\t"/*	FNMA231(_b,__c62,t13);		 FMA231( _b,__c62,t09); */\
		"vmovaps		%%ymm2 ,%%ymm6 				\n\t	vmovaps			%%ymm3 ,%%ymm7 				\n\t"/*	t14 =t10;	t15 =t11; */\
		" vfmadd231pd	%%ymm11,%%ymm15,%%ymm2 		\n\t	vfnmadd231pd	%%ymm11,%%ymm15,%%ymm6 		\n\t"/*	 FMA231(_d,__c62,t14);		FNMA231( _d,__c62,t10); */\
		"vfnmadd231pd	%%ymm10,%%ymm15,%%ymm3 		\n\t	 vfmadd231pd	%%ymm10,%%ymm15,%%ymm7 		\n\t"/*	FNMA231(_c,__c62,t15);		 FMA231( _c,__c62,t11); */\
		"vmovaps		%%ymm4 ,     (%%rcx)		\n\t	vmovaps			%%ymm0 ,     (%%rax)		\n\t"/* Write outputs into local store */\
		"vmovaps		%%ymm5 ,0x020(%%rcx)		\n\t	vmovaps			%%ymm1 ,0x020(%%rax)		\n\t"\
		"vmovaps		%%ymm6 ,     (%%rdx)		\n\t	vmovaps			%%ymm2 ,     (%%rbx)		\n\t"\
		"vmovaps		%%ymm7 ,0x020(%%rdx)		\n\t	vmovaps			%%ymm3 ,0x020(%%rbx)		\n\t"\
		"\n\t"\
	/*...Block 2: outputs into __in0 + [8,9,a,b]*istride: */\
		"addq	$%c[__i4],%%rax	\n\t"/* __in0 + 8*istride */\
		"addq	$%c[__i4],%%rcx	\n\t"/* __in0 + 9*istride */\
		"addq	$%c[__i4],%%rbx	\n\t"/* __in0 + a*istride */\
		"addq	$%c[__i4],%%rdx	\n\t"/* __in0 + b*istride */\
		"addq	$0x040,%%rsi 		/* cc += 8 */\n\t"\
		"vbroadcastsd 0x018(%%rsi),%%ymm12 	\n\t"/* load __r1 into ymm12 */\
		"vbroadcastsd 0x028(%%rsi),%%ymm13 	\n\t"/* load __r9 into ymm13 */\
		"vbroadcastsd 0x038(%%rsi),%%ymm14 	\n\t"/* load __r5 into ymm14 */\
		"vbroadcastsd 0x048(%%rsi),%%ymm15 	\n\t"/* load __rD into ymm15 */\
		"vmovaps (%%rax),%%ymm0  \n\t vmovaps 0x020(%%rax),%%ymm1  \n\t"/* t16 =__A1r;	t17 =__A1i; */\
		"vmovaps (%%rcx),%%ymm4  \n\t vmovaps 0x020(%%rcx),%%ymm5  \n\t"/* t20 =__A9r;	t21 =__A9i; */\
		"vmovaps (%%rbx),%%ymm8  \n\t vmovaps 0x020(%%rbx),%%ymm9  \n\t"/* _a=  __A5r;	_b  =__A5i; */\
		"vmovaps (%%rdx),%%ymm6  \n\t vmovaps 0x020(%%rdx),%%ymm7  \n\t"/* t22 =__ADr;	t23 =__ADi; */\
		"vfmadd231pd	0x020(%%rax),%%ymm12,%%ymm0 \n\t	vfnmadd231pd	(%%rax),%%ymm12,%%ymm1  	\n\t"/* FNMA231(__A1i,__r1,t16);	 FMA231(__A1r,__r1,t17); */\
		"vfmadd231pd	0x020(%%rcx),%%ymm13,%%ymm4 \n\t	vfnmadd231pd	(%%rcx),%%ymm13,%%ymm5  	\n\t"/* FNMA231(__A9i,__r9,t20);	 FMA231(__A9r,__r9,t21); */\
		"vbroadcastsd	0x040(%%rsi),%%ymm13		\n\t"/* load __cD5 */\
		"vfmadd231pd	0x020(%%rbx),%%ymm14,%%ymm8 \n\t	vfnmadd231pd	(%%rbx),%%ymm14,%%ymm9  	\n\t"/* FNMA231(__A5i,__r5,_a );	 FMA231(__A5r,__r5,_b ); */\
		"vbroadcastsd	0x020(%%rsi),%%ymm14		\n\t"/* load __c91 */\
		"vfmadd231pd	0x020(%%rdx),%%ymm15,%%ymm6 \n\t	vfnmadd231pd	(%%rdx),%%ymm15,%%ymm7  	\n\t"/* FNMA231(__ADi,__rD,t22);	 FMA231(__ADr,__rD,t23); */\
		"vbroadcastsd	0x030(%%rsi),%%ymm15		\n\t"/* load __c51 */\
		"vmovaps		%%ymm8 ,%%ymm10				\n\t	vmovaps			%%ymm0,%%ymm2 				\n\t"/*	_c= _a;	t18= t16; */\
		" vfmadd231pd	%%ymm6 ,%%ymm13,%%ymm8 		\n\t	 vfmadd231pd	%%ymm4 ,%%ymm14,%%ymm0 		\n\t"/*	 FMA231(t22,__cD5,_a);		 FMA231(t20,__c91,t16); */\
		"vmovaps		%%ymm9 ,%%ymm11				\n\t	vmovaps			%%ymm1 ,%%ymm3 				\n\t"/*	_d= _b;	t19= t17; */\
		" vfmadd231pd	%%ymm7 ,%%ymm13,%%ymm9 		\n\t	 vfmadd231pd	%%ymm5 ,%%ymm14,%%ymm1 		\n\t"/*	 FMA231(t23,__cD5,_b);		 FMA231(t21,__c91,t17); */\
		"vfnmadd231pd	%%ymm6 ,%%ymm13,%%ymm10		\n\t	vfnmadd231pd	%%ymm4 ,%%ymm14,%%ymm2 		\n\t"/*	FNMA231(t22,__cD5,_c);		FNMA231(t20,__c91,t18); */\
		"vfnmadd231pd	%%ymm7 ,%%ymm13,%%ymm11		\n\t	vfnmadd231pd	%%ymm5 ,%%ymm14,%%ymm3 		\n\t"/*	FNMA231(t23,__cD5,_d);		FNMA231(t21,__c91,t19); */\
		"vmovaps		%%ymm0 ,%%ymm4 				\n\t	vmovaps			%%ymm1 ,%%ymm5 				\n\t"/*	t20 =t16;	t21 =t17; */\
		"vfnmadd231pd	%%ymm8 ,%%ymm15,%%ymm4 		\n\t	 vfmadd231pd	%%ymm8 ,%%ymm15,%%ymm0 		\n\t"/*	FNMA231(_a,__c51,t20);		 FMA231(_a,__c51,t16); */\
		"vfnmadd231pd	%%ymm9 ,%%ymm15,%%ymm5 		\n\t	 vfmadd231pd	%%ymm9 ,%%ymm15,%%ymm1 		\n\t"/*	FNMA231(_b,__c51,t21);		 FMA231(_b,__c51,t17); */\
		"vmovaps		%%ymm2 ,%%ymm6 				\n\t	vmovaps			%%ymm3 ,%%ymm7 				\n\t"/*	t22 =t18;	t23 =t19; */\
		" vfmadd231pd	%%ymm11,%%ymm15,%%ymm2 		\n\t	vfnmadd231pd	%%ymm11,%%ymm15,%%ymm6 		\n\t"/*	 FMA231(_d,__c51,t22);		FNMA231(_d,__c51,t18); */\
		"vfnmadd231pd	%%ymm10,%%ymm15,%%ymm3 		\n\t	 vfmadd231pd	%%ymm10,%%ymm15,%%ymm7 		\n\t"/*	FNMA231(_c,__c51,t23);		 FMA231(_c,__c51,t19); */\
		"vmovaps		%%ymm4 ,     (%%rcx)		\n\t	vmovaps			%%ymm0 ,     (%%rax)		\n\t"/* Write outputs into local store */\
		"vmovaps		%%ymm5 ,0x020(%%rcx)		\n\t	vmovaps			%%ymm1 ,0x020(%%rax)		\n\t"\
		"vmovaps		%%ymm6 ,     (%%rdx)		\n\t	vmovaps			%%ymm2 ,     (%%rbx)		\n\t"\
		"vmovaps		%%ymm7 ,0x020(%%rdx)		\n\t	vmovaps			%%ymm3 ,0x020(%%rbx)		\n\t"\
		"\n\t"\
	/*...Block 3: outputs into __in0 + [c,d,e,f]*istride: */\
		"addq	$%c[__i4],%%rax	\n\t"/* __in0 + c*istride */\
		"addq	$%c[__i4],%%rcx	\n\t"/* __in0 + d*istride */\
		"addq	$%c[__i4],%%rbx	\n\t"/* __in0 + e*istride */\
		"addq	$%c[__i4],%%rdx	\n\t"/* __in0 + f*istride */\
		"addq	$0x040,%%rsi 		/* cc += 8 */\n\t"\
		"vbroadcastsd 0x018(%%rsi),%%ymm12 	\n\t"/* load __r3 into ymm12 */\
		"vbroadcastsd 0x028(%%rsi),%%ymm13 	\n\t"/* load __rB into ymm13 */\
		"vbroadcastsd 0x038(%%rsi),%%ymm14 	\n\t"/* load __r7 into ymm14 */\
		"vbroadcastsd 0x048(%%rsi),%%ymm15 	\n\t"/* load __rF into ymm15 */\
		"vmovaps (%%rax),%%ymm0  \n\t vmovaps 0x020(%%rax),%%ymm1  \n\t"/* t24 =__A3r;	t25 =__A3i; */\
		"vmovaps (%%rcx),%%ymm4  \n\t vmovaps 0x020(%%rcx),%%ymm5  \n\t"/* t28 =__ABr;	t29 =__ABi; */\
		"vmovaps (%%rbx),%%ymm8  \n\t vmovaps 0x020(%%rbx),%%ymm9  \n\t"/* _a  =__A7r;	_b  =__A7i; */\
		"vmovaps (%%rdx),%%ymm6  \n\t vmovaps 0x020(%%rdx),%%ymm7  \n\t"/* t30 =__AFr;	t31 =__AFi; */\
		"vfmadd231pd	0x020(%%rax),%%ymm12,%%ymm0 \n\t	vfnmadd231pd	(%%rax),%%ymm12,%%ymm1  	\n\t"/* FNMA231(__A3i,__r3,t24);	 FMA231(__A3r,__r3,t25); */\
		"vfmadd231pd	0x020(%%rcx),%%ymm13,%%ymm4 \n\t	vfnmadd231pd	(%%rcx),%%ymm13,%%ymm5  	\n\t"/* FNMA231(__ABi,__rB,t28 );	 FMA231(__ABr,__rB,t29 ); */\
		"vbroadcastsd	0x040(%%rsi),%%ymm13		\n\t"/* load __cF7 */\
		"vfmadd231pd	0x020(%%rbx),%%ymm14,%%ymm8 \n\t	vfnmadd231pd	(%%rbx),%%ymm14,%%ymm9  	\n\t"/* FNMA231(__A7i,__r7,_a);		 FMA231(__A7r,__r7,_b); */\
		"vbroadcastsd	0x020(%%rsi),%%ymm14		\n\t"/* load __cB3 */\
		"vfmadd231pd	0x020(%%rdx),%%ymm15,%%ymm6 \n\t	vfnmadd231pd	(%%rdx),%%ymm15,%%ymm7  	\n\t"/* FNMA231(__AFi,__rF,t30 );	 FMA231(__AFr,__rF,t31 ); */\
		"vbroadcastsd	0x030(%%rsi),%%ymm15		\n\t"/* load __c73 */\
		"vmovaps		%%ymm8 ,%%ymm10				\n\t	vmovaps			%%ymm0,%%ymm2 				\n\t"/*	_c= _a;	t26= t24; */\
		" vfmadd231pd	%%ymm6 ,%%ymm13,%%ymm8 		\n\t	 vfmadd231pd	%%ymm4 ,%%ymm14,%%ymm0 		\n\t"/*	 FMA231(t30,__cF7,_a);		 FMA231(t28,__cB3,t24); */\
		"vmovaps		%%ymm9 ,%%ymm11				\n\t	vmovaps			%%ymm1 ,%%ymm3 				\n\t"/*	_d= _b;	t27= t25; */\
		" vfmadd231pd	%%ymm7 ,%%ymm13,%%ymm9 		\n\t	 vfmadd231pd	%%ymm5 ,%%ymm14,%%ymm1 		\n\t"/*	 FMA231(t31,__cF7,_b);		 FMA231(t29,__cB3,t25); */\
		"vfnmadd231pd	%%ymm6 ,%%ymm13,%%ymm10		\n\t	vfnmadd231pd	%%ymm4 ,%%ymm14,%%ymm2 		\n\t"/*	FNMA231(t30,__cF7,_c);		FNMA231(t28,__cB3,t26); */\
		"vfnmadd231pd	%%ymm7 ,%%ymm13,%%ymm11		\n\t	vfnmadd231pd	%%ymm5 ,%%ymm14,%%ymm3 		\n\t"/*	FNMA231(t31,__cF7,_d);		FNMA231(t29,__cB3,t27); */\
		"vmovaps		%%ymm0 ,%%ymm4 				\n\t	vmovaps			%%ymm1 ,%%ymm5 				\n\t"/*	t28 =t24;	t29 =t25; */\
		"vfnmadd231pd	%%ymm8 ,%%ymm15,%%ymm4 		\n\t	 vfmadd231pd	%%ymm8 ,%%ymm15,%%ymm0 		\n\t"/*	FNMA231(_a,__c73,t28);		 FMA231(_a,__c73,t24); */\
		"vfnmadd231pd	%%ymm9 ,%%ymm15,%%ymm5 		\n\t	 vfmadd231pd	%%ymm9 ,%%ymm15,%%ymm1 		\n\t"/*	FNMA231(_b,__c73,t29);		 FMA231(_b,__c73,t25); */\
		"vmovaps		%%ymm2 ,%%ymm6 				\n\t	vmovaps			%%ymm3 ,%%ymm7 				\n\t"/*	t30 =t26;	t31 =t27; */\
		" vfmadd231pd	%%ymm11,%%ymm15,%%ymm2 		\n\t	vfnmadd231pd	%%ymm11,%%ymm15,%%ymm6 		\n\t"/*	 FMA231(_d,__c73,t30);		FNMA231(_d,__c73,t26); */\
		"vfnmadd231pd	%%ymm10,%%ymm15,%%ymm3 		\n\t	 vfmadd231pd	%%ymm10,%%ymm15,%%ymm7 		\n\t"/*	FNMA231(_c,__c73,t31);		 FMA231(_c,__c73,t27); */\
		"vmovaps		%%ymm4 ,     (%%rcx)		\n\t	vmovaps			%%ymm0 ,     (%%rax)		\n\t"/* Write outputs into local store */\
		"vmovaps		%%ymm5 ,0x020(%%rcx)		\n\t	vmovaps			%%ymm1 ,0x020(%%rax)		\n\t"\
		"vmovaps		%%ymm6 ,     (%%rdx)		\n\t	vmovaps			%%ymm2 ,     (%%rbx)		\n\t"\
		"vmovaps		%%ymm7 ,0x020(%%rdx)		\n\t	vmovaps			%%ymm3 ,0x020(%%rbx)		\n\t"\
		"\n\t"\
	/*************************************************************************************/\
	/*  And now do four more radix-4 transforms, including the internal twiddle factors: */\
	/*************************************************************************************/\
	/* Block 0: Combine 0-output of each radix-4, i.e. inputs from __in0 + [0,4,8,c]*istride: */\
		"movq	%[__in0],%%rax		\n\t"\
		"leaq	%c[__i4](%%rax),%%rbx	\n\t"/* __in0 +   [4*istride] */\
		"leaq	%c[__i4](%%rbx),%%rcx	\n\t"/* __in0 + 2*[4*istride] */\
		"leaq	%c[__i4](%%rcx),%%rdx	\n\t"/* __in0 + 3*[4*istride] */\
		"subq	$0x0c0,%%rsi 		/* revert cc-ptr to base value */\n\t"\
		"\n\t"\
		/*...Read t0,8,16,24 from local store ... Do the 4 Im-part FMAs first, because their results needed 1st below */\
		"vbroadcastsd	0x050(%%rsi),%%ymm14		\n\t	vbroadcastsd	0x0d0(%%rsi),%%ymm15		\n\t"/* load __c2,c31 into pair of regs */\
		"vmovaps		     (%%rax),%%ymm0 		\n\t	vmovaps			     (%%rbx),%%ymm2 		\n\t"/*    t00;    t08; */\
		"vmovaps		     (%%rcx),%%ymm4 		\n\t	vmovaps			     (%%rdx),%%ymm6 		\n\t"/*    t16;    t24; */\
		"vmovaps			 %%ymm0 ,%%ymm8 		\n\t	vmovaps				 %%ymm4 ,%%ymm10		\n\t"/* _a=t00; _c=t16; */\
		" vfmadd231pd	%%ymm2 ,%%ymm14,%%ymm0 		\n\t	 vfmadd231pd	%%ymm6 ,%%ymm15,%%ymm4 		\n\t"/*	 FMA231(t08,__c2 ,t00);		 FMA231(t24,__c31,t16); */\
		"vmovaps		0x020(%%rax),%%ymm1 		\n\t	vmovaps			0x020(%%rbx),%%ymm3 		\n\t"/*    t01;    t09; */\
		"vmovaps		0x020(%%rcx),%%ymm5 		\n\t	vmovaps			0x020(%%rdx),%%ymm7 		\n\t"/*    t17;    t25; */\
		"vmovaps			 %%ymm1 ,%%ymm9 		\n\t	vmovaps				 %%ymm5 ,%%ymm11		\n\t"/* _b=t01; _d=t17; */\
		" vfmadd231pd	%%ymm3 ,%%ymm14,%%ymm1 		\n\t	 vfmadd231pd	%%ymm7 ,%%ymm15,%%ymm5 		\n\t"/*	 FMA231(t09,__c2 ,t01);		 FMA231(t25,__c31,t17); */\
		"vfnmadd231pd	%%ymm2 ,%%ymm14,%%ymm8 		\n\t	vfnmadd231pd	%%ymm6 ,%%ymm15,%%ymm10		\n\t"/*	FNMA231(t08,__c2 ,_a );		FNMA231(t24,__c31,_c ); */\
		"vbroadcastsd	0x090(%%rsi),%%ymm6 		\n\t"/* load __c1 */\
		"vfnmadd231pd	%%ymm3 ,%%ymm14,%%ymm9 		\n\t	vfnmadd231pd	%%ymm7 ,%%ymm15,%%ymm11		\n\t"/*	FNMA231(t09,__c2 ,_b );		FNMA231(t25,__c31,_d ); */\
		"vmovaps			 %%ymm0 ,%%ymm12		\n\t	vmovaps				 %%ymm1 ,%%ymm13		\n\t"/* _e = t00; _f = t01; */\
		" vfmadd231pd	%%ymm4 ,%%ymm6 ,%%ymm0 		\n\t	 vfmadd231pd	%%ymm5 ,%%ymm6 ,%%ymm1 		\n\t"/*	 FMA231(t16,__c1 ,t00);		 FMA231(t17,__c1 ,t01); */\
		"vfnmadd231pd	%%ymm4 ,%%ymm6 ,%%ymm12		\n\t	vfnmadd231pd	%%ymm5 ,%%ymm6 ,%%ymm13		\n\t"/*	FNMA231(t16,__c1 ,_e );		FNMA231(t17,__c1 ,_f ); */\
		"vmovaps			 %%ymm8 ,%%ymm2 		\n\t	vmovaps				 %%ymm9 ,%%ymm3 		\n\t"/* t08 = _a ; t09 = _b; */\
		"vfnmadd231pd	%%ymm11,%%ymm6 ,%%ymm2 		\n\t	 vfmadd231pd	%%ymm10,%%ymm6 ,%%ymm3 		\n\t"/*	FNMA231(_d ,__c1 ,t08);		 FMA231(_c ,__c1 ,t09); */\
		" vfmadd231pd	%%ymm11,%%ymm6 ,%%ymm8 		\n\t	vfnmadd231pd	%%ymm10,%%ymm6 ,%%ymm9 		\n\t"/*	 FMA231(_d ,__c1 ,_a );		FNMA231(_c ,__c1 ,_b ); */\
		/* Write outputs - Swap 4/C outputs for DIT */\
		"movq	%[__out0],%%r10		\n\t"\
		"leaq	%c[__o4](%%r10),%%r12	\n\t"/* __out0 +   [4*ostride] */\
		"leaq	%c[__o4](%%r12),%%r11	\n\t"/* __out0 + 2*[4*ostride] */\
		"leaq	%c[__o4](%%r11),%%r13	\n\t"/* __out0 + 3*[4*ostride] */\
		"vmovaps		%%ymm0 ,     (%%r10)		\n\t	vmovaps			%%ymm1 ,0x020(%%r10)		\n\t"/* __B0r= t00;		__B0i= t01; */\
		"vmovaps		%%ymm12,     (%%r11)		\n\t	vmovaps			%%ymm13,0x020(%%r11)		\n\t"/* __B8r= _e ;		__B8i= _f ; */\
		"vmovaps		%%ymm2 ,     (%%r13)		\n\t	vmovaps			%%ymm3 ,0x020(%%r13)		\n\t"/* __Bcr= t08;		__Bci= t09; */\
		"vmovaps		%%ymm8 ,     (%%r12)		\n\t	vmovaps			%%ymm9 ,0x020(%%r12)		\n\t"/* __B4r= _a ;		__B4i= _b ; */\
		"\n\t"\
		/*...Block 2: t4,12,20,28 */\
		"vbroadcastsd	0x110(%%rsi),%%ymm13	\n\t"/* cc0 + 0x22 = __two; Actually holds 1.0 in AVX2 mode */\
		"addq	$%c[__i1],%%rax	\n\t"/* __in0 + 1*istride */\
		"addq	$%c[__i1],%%rbx	\n\t"/* __in0 + 5*istride */\
		"addq	$%c[__i1],%%rcx	\n\t"/* __in0 + 9*istride */\
		"addq	$%c[__i1],%%rdx	\n\t"/* __in0 + d*istride */\
		"vbroadcastsd	0x050(%%rsi),%%ymm14		\n\t	vbroadcastsd	0x0d0(%%rsi),%%ymm15		\n\t"/* load __c2,31 into pair of regs */\
		"vmovaps		     (%%rcx),%%ymm4 		\n\t	vmovaps			0x020(%%rcx),%%ymm5 		\n\t"/*    t20;    t21; */\
		"vmovaps		     (%%rdx),%%ymm6 		\n\t	vmovaps			0x020(%%rdx),%%ymm7 		\n\t"/*    t28;    t29; */\
		"vmovaps			 %%ymm4 ,%%ymm10 		\n\t	vmovaps				 %%ymm7 ,%%ymm11		\n\t"/* _c=t20; _d=t29; */\
		" vfmadd231pd	%%ymm5 ,%%ymm13,%%ymm10		\n\t	 vfmsub231pd	%%ymm5 ,%%ymm13,%%ymm4 		\n\t"/*	FNMA231(t21,1.0,_c );		 FMA231(t21,1.0,t20); */\
		" vfmsub231pd	%%ymm6 ,%%ymm13,%%ymm11		\n\t	 vfmadd231pd	%%ymm6 ,%%ymm13,%%ymm7 		\n\t"/*	 FMA231(t28,1.0,_d );		FNMA231(t28,1.0,t29); */\
		"vbroadcastsd	0x010(%%rsi),%%ymm13		\n\t"/* load __c1i2 */\
		"vmovaps		     (%%rax),%%ymm0 		\n\t	vmovaps			0x020(%%rax),%%ymm1 		\n\t"/*    t04;    t05; */\
		"vmovaps		     (%%rbx),%%ymm2 		\n\t	vmovaps			0x020(%%rbx),%%ymm3 		\n\t"/*    t12;    t13; */\
		"vmovaps			 %%ymm0 ,%%ymm8 		\n\t	vmovaps				 %%ymm1 ,%%ymm9 		\n\t"/* _a = t04; _b = t05; */\
		"vmovaps			 %%ymm10,%%ymm5 		\n\t	vmovaps				 %%ymm4 ,%%ymm12		\n\t"/* t21 = _c; _e = t20; */\
		"vfnmadd231pd	%%ymm3 ,%%ymm14,%%ymm8 		\n\t	 vfmadd231pd	%%ymm11,%%ymm15,%%ymm5 		\n\t"/*	 FMA231(t13,__c2 ,_a );		 FMA231(_d ,__c31,t21); */\
		" vfmadd231pd	%%ymm2 ,%%ymm14,%%ymm9 		\n\t	 vfmadd231pd	%%ymm7 ,%%ymm15,%%ymm4 		\n\t"/*	FNMA231(t12,__c2 ,_b );		 FMA231(t29,__c31,t20); */\
		" vfmadd231pd	%%ymm3 ,%%ymm14,%%ymm0 		\n\t	vfnmadd231pd	%%ymm11,%%ymm15,%%ymm10		\n\t"/*	FNMA231(t13,__c2 ,t04);		FNMA231(_d ,__c31,_c ); */\
		"vfnmadd231pd	%%ymm2 ,%%ymm14,%%ymm1 		\n\t	vfnmadd231pd	%%ymm7 ,%%ymm15,%%ymm12		\n\t"/*	 FMA231(t12,__c2 ,t05);		FNMA231(t29,__c31,_e ); */\
		"vmovaps			 %%ymm8 ,%%ymm2 		\n\t	vmovaps				 %%ymm9 ,%%ymm3 		\n\t"/* t12 = _a; t13 = _b; */\
		"vfnmadd231pd	%%ymm4 ,%%ymm13,%%ymm2 		\n\t	 vfmadd231pd	%%ymm5 ,%%ymm13,%%ymm3 		\n\t"/*	FNMA231(t20,__c1i2,t12);	 FMA231(t21,__c1i2,t13); */\
		" vfmadd231pd	%%ymm4 ,%%ymm13,%%ymm8 		\n\t	vfnmadd231pd	%%ymm5 ,%%ymm13,%%ymm9 		\n\t"/*	 FMA231(t20,__c1i2,_a );	FNMA231(t21,__c1i2,_b ); */\
		"vmovaps			 %%ymm0 ,%%ymm11		\n\t	vmovaps				 %%ymm1 ,%%ymm7 		\n\t"/* _d = t04; t29 = t05; */\
		" vfmadd231pd	%%ymm10,%%ymm13,%%ymm0 		\n\t	 vfmadd231pd	%%ymm12,%%ymm13,%%ymm1 		\n\t"/*	 FMA231(_c ,__c1i2,t04);	 FMA231(_e ,__c1i2,t05); */\
		"vfnmadd231pd	%%ymm10,%%ymm13,%%ymm11		\n\t	vfnmadd231pd	%%ymm12,%%ymm13,%%ymm7 		\n\t"/*	FNMA231(_c ,__c1i2,_d );	FNMA231(_e ,__c1i2,t29); */\
		/* Write outputs - Not sure why, but need apply 6/E swap here *and* then pairwise swap, i.e. 2A[6][E] => [2A][E6] => E62A: */\
		"addq	$%c[__o2],%%r10	\n\t"/* __out0 + 2*ostride */\
		"addq	$%c[__o2],%%r12	\n\t"/* __out0 + 6*ostride */\
		"addq	$%c[__o2],%%r11	\n\t"/* __out0 + a*ostride */\
		"addq	$%c[__o2],%%r13	\n\t"/* __out0 + e*ostride */\
		"vmovaps		%%ymm2 ,     (%%r13)		\n\t	vmovaps			%%ymm3 ,0x020(%%r13)		\n\t"/* __BEr= t12;		__BEi= t13; */\
		"vmovaps		%%ymm8 ,     (%%r12)		\n\t	vmovaps			%%ymm9 ,0x020(%%r12)		\n\t"/* __B6r= _a ;		__B6i= _b ; */\
		"vmovaps		%%ymm0 ,     (%%r10)		\n\t	vmovaps			%%ymm1 ,0x020(%%r10)		\n\t"/* __B2r= t04;		__B2i= t05; */\
		"vmovaps		%%ymm11,     (%%r11)		\n\t	vmovaps			%%ymm7 ,0x020(%%r11)		\n\t"/* __BAr= _d ;		__BAi= t29; */\
		"\n\t"\
		/*...Block 1: t2,10,18,26 */\
		"addq	$%c[__i1],%%rax	\n\t"/* __in0 + 2*istride */\
		"addq	$%c[__i1],%%rbx	\n\t"/* __in0 + 6*istride */\
		"addq	$%c[__i1],%%rcx	\n\t"/* __in0 + a*istride */\
		"addq	$%c[__i1],%%rdx	\n\t"/* __in0 + e*istride */\
		"vmovaps		     (%%rbx),%%ymm2 		\n\t	vmovaps			0x020(%%rbx),%%ymm3 		\n\t"/*    t10;    t11; */\
		"vbroadcastsd	0x008(%%rsi),%%ymm15		\n\t"/* load __sc  */\
		"vbroadcastsd	0x0d0(%%rsi),%%ymm14		\n\t"/* load __c31 */\
		"vmovaps		     (%%rcx),%%ymm4 		\n\t	vmovaps			0x020(%%rcx),%%ymm5 		\n\t"/*    t18;    t19; */\
		"vmovaps		     (%%rdx),%%ymm6 		\n\t	vmovaps			0x020(%%rdx),%%ymm7 		\n\t"/*    t26;    t27; */\
		"vaddpd		%%ymm3 ,%%ymm2 ,%%ymm12 		\n\t"/* _e = t11+t10; */\
		"vsubpd		%%ymm2 ,%%ymm3 ,%%ymm13			\n\t"/* _f = t11-t10; */\
		"vmovaps			 %%ymm4 ,%%ymm10 		\n\t	vmovaps				 %%ymm7 ,%%ymm8 		\n\t"/* _c = t18; _a = t27; */\
		" vfmadd231pd	%%ymm5 ,%%ymm15,%%ymm10		\n\t	 vfmadd231pd	%%ymm6 ,%%ymm15,%%ymm8 		\n\t"/*	FNMA231(t19,__sc,_c );		 FMS231(t26,__sc,_a ); */\
		"vmovaps			 %%ymm5 ,%%ymm11 		\n\t	vmovaps				 %%ymm6 ,%%ymm9 		\n\t"/* _d = t19; _b = t26; */\
		"vbroadcastsd	0x018(%%rsi),%%ymm6 		\n\t"/* load __c2i2 */\
		"vfnmadd231pd	%%ymm4 ,%%ymm15,%%ymm11		\n\t	 vfmsub231pd	%%ymm7 ,%%ymm15,%%ymm9 		\n\t"/*	 FMA231(t18,__sc,_d );		 FMA231(t27,__sc,_b ); */\
		"vbroadcastsd	(%%rsi),%%ymm15				\n\t"/* load __c1_c */\
		"vmovaps		     (%%rax),%%ymm0 		\n\t	vmovaps			0x020(%%rax),%%ymm1 		\n\t"/*    t02;    t03; */\
		"vmovaps			 %%ymm10,%%ymm4 		\n\t	vmovaps				 %%ymm0 ,%%ymm2 		\n\t"/* t18 = _c;	t10 = t02; */\
		" vfmadd231pd	%%ymm8 ,%%ymm14,%%ymm4 		\n\t	 vfmadd231pd	%%ymm12,%%ymm6 ,%%ymm0 		\n\t"/*	 FMA231(_a ,__c31,t18);		 FMA231(_e ,__c2i2,t02); */\
		"vmovaps			 %%ymm11,%%ymm5 		\n\t	vmovaps				 %%ymm1 ,%%ymm3 		\n\t"/* t19 = _d;	t11 = t03; */\
		" vfmadd231pd	%%ymm9 ,%%ymm14,%%ymm5 		\n\t	 vfmadd231pd	%%ymm13,%%ymm6 ,%%ymm1 		\n\t"/*	 FMA231(_b ,__c31,t19);		 FMA231(_f ,__c2i2,t03); */\
		"vfnmadd231pd	%%ymm8 ,%%ymm14,%%ymm10		\n\t	vfnmadd231pd	%%ymm12,%%ymm6 ,%%ymm2 		\n\t"/*	FNMA231(_a ,__c31,_c );		FNMA231(_e ,__c2i2,t10); */\
		"vfnmadd231pd	%%ymm9 ,%%ymm14,%%ymm11		\n\t	vfnmadd231pd	%%ymm13,%%ymm6 ,%%ymm3 		\n\t"/*	FNMA231(_b ,__c31,_d );		FNMA231(_f ,__c2i2,t11); */\
		"vmovaps			 %%ymm0 ,%%ymm8 		\n\t	vmovaps				 %%ymm1 ,%%ymm9 		\n\t"/* _a = t02; _b = t03; */\
		" vfmadd231pd	%%ymm4 ,%%ymm15,%%ymm0 		\n\t	 vfmadd231pd	%%ymm5 ,%%ymm15,%%ymm1 		\n\t"/*	 FMA231(t18,__c1_c,t02);	 FMA231(t19,__c1_c,t03); */\
		"vfnmadd231pd	%%ymm4 ,%%ymm15,%%ymm8 		\n\t	vfnmadd231pd	%%ymm5 ,%%ymm15,%%ymm9 		\n\t"/*	FNMA231(t18,__c1_c,_a );	FNMA231(t19,__c1_c,_b ); */\
		"vmovaps			 %%ymm2 ,%%ymm12		\n\t	vmovaps				 %%ymm3 ,%%ymm13		\n\t"/* _e = t10; _f = t11; */\
		"vfnmadd231pd	%%ymm11,%%ymm15,%%ymm2 		\n\t	 vfmadd231pd	%%ymm10,%%ymm15,%%ymm3 		\n\t"/*	FNMA231(_d ,__c1_c,t10);	 FMA231(_c ,__c1_c,t11); */\
		" vfmadd231pd	%%ymm11,%%ymm15,%%ymm12		\n\t	vfnmadd231pd	%%ymm10,%%ymm15,%%ymm13		\n\t"/*	 FMA231(_d ,__c1_c,_e );	FNMA231(_c ,__c1_c,_f ); */\
		/* Write outputs: Swap 5/D outputs for DIT */\
		"subq	$%c[__o1],%%r10	\n\t"/* __out0 + 1*ostride */\
		"subq	$%c[__o1],%%r12	\n\t"/* __out0 + 5*ostride */\
		"subq	$%c[__o1],%%r11	\n\t"/* __out0 + 9*ostride */\
		"subq	$%c[__o1],%%r13	\n\t"/* __out0 + d*ostride */\
		"vmovaps		%%ymm0 ,     (%%r10)		\n\t	vmovaps			%%ymm1 ,0x020(%%r10)		\n\t"/* __B1r= t02;		__B1i= t03; */\
		"vmovaps		%%ymm8 ,     (%%r11)		\n\t	vmovaps			%%ymm9 ,0x020(%%r11)		\n\t"/* __B9r= _a ;		__B9i= _b ; */\
		"vmovaps		%%ymm2 ,     (%%r13)		\n\t	vmovaps			%%ymm3 ,0x020(%%r13)		\n\t"/* __BDr= t10;		__BDi= t11; */\
		"vmovaps		%%ymm12,     (%%r12)		\n\t	vmovaps			%%ymm13,0x020(%%r12)		\n\t"/* __B5r= _e ;		__B5i= _f ; */\
		"\n\t"\
		/*...Block 3: t6,14,22,30 */\
		"addq	$%c[__i1],%%rax	\n\t"/* __in0 + 3*istride */\
		"addq	$%c[__i1],%%rbx	\n\t"/* __in0 + 7*istride */\
		"addq	$%c[__i1],%%rcx	\n\t"/* __in0 + b*istride */\
		"addq	$%c[__i1],%%rdx	\n\t"/* __in0 + f*istride */\
		"vmovaps		     (%%rbx),%%ymm2 		\n\t	vmovaps			0x020(%%rbx),%%ymm3 		\n\t"/*    t14;    t15; */\
		"vbroadcastsd	0x008(%%rsi),%%ymm15		\n\t"/* load __sc  */\
		"vmovaps		     (%%rcx),%%ymm4 		\n\t	vmovaps			0x020(%%rcx),%%ymm5 		\n\t"/*    t22;    t23; */\
		"vmovaps		     (%%rdx),%%ymm6 		\n\t	vmovaps			0x020(%%rdx),%%ymm7 		\n\t"/*    t30;    t31; */\
		"vsubpd		%%ymm3 ,%%ymm2 ,%%ymm10 		\n\t"/* _c = t14-t15; */\
		"vaddpd		%%ymm3 ,%%ymm2 ,%%ymm11			\n\t"/* _d = t14+t15; */\
		"vbroadcastsd	0x0d0(%%rsi),%%ymm14		\n\t"/* load __c31 */\
		"vmovaps			 %%ymm5 ,%%ymm12 		\n\t	vmovaps				 %%ymm4 ,%%ymm13		\n\t"/* _e = t23; _f = t22;*/\
		" vfmadd231pd	%%ymm4 ,%%ymm15,%%ymm12		\n\t	 vfmsub231pd	%%ymm5 ,%%ymm15,%%ymm13		\n\t"/*	 FMS231(t22,__sc,_e );		 FMA231(t23,__sc,_f );*/\
		"vmovaps			 %%ymm6 ,%%ymm8  		\n\t	vmovaps				 %%ymm7 ,%%ymm9 		\n\t"/* _a = t30; _b = t31; */\
		" vfmadd231pd	%%ymm7 ,%%ymm15,%%ymm8 		\n\t	vfnmadd231pd	%%ymm6 ,%%ymm15,%%ymm9 		\n\t"/*	FNMA231(t31,__sc,_a );		 FMA231(t30,__sc,_b );*/\
		"vbroadcastsd	0x018(%%rsi),%%ymm6 		\n\t"/* load __c2i2 */\
		"vbroadcastsd	(%%rsi),%%ymm15				\n\t"/* load __c1_c */\
		"vmovaps		     (%%rax),%%ymm0 		\n\t	vmovaps			0x020(%%rax),%%ymm1 		\n\t"/*    t06;    t07; */\
		"vmovaps			 %%ymm1 ,%%ymm3 		\n\t	vmovaps				 %%ymm0 ,%%ymm2 		\n\t"/* t15= t07;	t14= t06; */\
		"vfnmadd231pd	%%ymm11,%%ymm6 ,%%ymm1 		\n\t	vfnmadd231pd	%%ymm10,%%ymm6 ,%%ymm0 		\n\t"/*	FNMA231(_d ,__c2i2,t07);	FNMA231(_c ,__c2i2,t06); */\
		"vmovaps			 %%ymm12,%%ymm4 		\n\t	vmovaps				 %%ymm13,%%ymm5 		\n\t"/* t22= _e; t23= _f; */\
		"vfnmadd231pd	%%ymm8 ,%%ymm14,%%ymm4 		\n\t	vfnmadd231pd	%%ymm9 ,%%ymm14,%%ymm5 		\n\t"/*	FNMA231(_a ,__c31 ,t22);	FNMA231(_b ,__c31 ,t23); */\
		" vfmadd231pd	%%ymm8 ,%%ymm14,%%ymm12		\n\t	 vfmadd231pd	%%ymm9 ,%%ymm14,%%ymm13		\n\t"/*	 FMA231(_a ,__c31 ,_e );	 FMA231(_b ,__c31 ,_f ); */\
		" vfmadd231pd	%%ymm10,%%ymm6 ,%%ymm2 		\n\t	 vfmadd231pd	%%ymm11,%%ymm6 ,%%ymm3 		\n\t"/*	 FMA231(_c ,__c2i2,t14);	 FMA231(_d ,__c2i2,t15); */\
		"vmovaps			 %%ymm0 ,%%ymm8 		\n\t	vmovaps				 %%ymm1 ,%%ymm9 		\n\t"/* _a = t06; _b = t07; */\
		" vfmadd231pd	%%ymm4 ,%%ymm15,%%ymm0 		\n\t	 vfmadd231pd	%%ymm5 ,%%ymm15,%%ymm1 		\n\t"/*	 FMA231(t22,__c1_c,t06);	 FMA231(t23,__c1_c,t07); */\
		"vfnmadd231pd	%%ymm4 ,%%ymm15,%%ymm8 		\n\t	vfnmadd231pd	%%ymm5 ,%%ymm15,%%ymm9 		\n\t"/*	FNMA231(t22,__c1_c,_a );	FNMA231(t23,__c1_c,_b ); */\
		"vmovaps			 %%ymm2 ,%%ymm10		\n\t	vmovaps				 %%ymm3 ,%%ymm11		\n\t"/* _c = t14; _d = t15; */\
		"vfnmadd231pd	%%ymm13,%%ymm15,%%ymm2 		\n\t	 vfmadd231pd	%%ymm12,%%ymm15,%%ymm3 		\n\t"/*	FNMA231(_f ,__c1_c,t14);	 FMA231(_e ,__c1_c,t15); */\
		" vfmadd231pd	%%ymm13,%%ymm15,%%ymm10		\n\t	vfnmadd231pd	%%ymm12,%%ymm15,%%ymm11		\n\t"/*	 FMA231(_f ,__c1_c,_c );	FNMA231(_e ,__c1_c,_d ); */\
		/* Write outputs: Swap 7/F outputs for DIT */\
		"addq	$%c[__o2],%%r10	\n\t"/* __out0 + 3*ostride */\
		"addq	$%c[__o2],%%r12	\n\t"/* __out0 + 7*ostride */\
		"addq	$%c[__o2],%%r11	\n\t"/* __out0 + b*ostride */\
		"addq	$%c[__o2],%%r13	\n\t"/* __out0 + f*ostride */\
		"vmovaps		%%ymm0 ,     (%%r10)		\n\t	vmovaps			%%ymm1 ,0x020(%%r10)		\n\t"/* __B3r= t06;		__B3i= t07; */\
		"vmovaps		%%ymm8 ,     (%%r11)		\n\t	vmovaps			%%ymm9 ,0x020(%%r11)		\n\t"/* __BBr= _a ;		__BBi= _b ; */\
		"vmovaps		%%ymm2 ,     (%%r13)		\n\t	vmovaps			%%ymm3 ,0x020(%%r13)		\n\t"/* __BFr= t14;		__BFi= t15; */\
		"vmovaps		%%ymm10,     (%%r12)		\n\t	vmovaps			%%ymm11,0x020(%%r12)		\n\t"/* __B7r= _c ;		__B7i= _d ; */\
		:					/* outputs: none */\
		: [__in0] "m" (Xin0)	/* All inputs from memory addresses here */\
		 ,[__i1] "e" (Xi1)\
		 ,[__i2] "e" (Xi2)\
		 ,[__i3] "e" (Xi3)\
		 ,[__i4] "e" (Xi4)\
		 ,[__out0] "m" (Xout0)\
		 ,[__o1] "e" (Xo1)\
		 ,[__o2] "e" (Xo2)\
		 ,[__o3] "e" (Xo3)\
		 ,[__o4] "e" (Xo4)\
		 ,[__cc0] "m" (Xcc0)\
		: "cc","memory","rax","rbx","rcx","rdx","rsi","r10","r11","r12","r13","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15"	/* Clobbered registers */\
	);\
	}

	// Doubled-data version of PAIR_SQUARE_4_SSE2, taking advantage of 2-per-cycle throughput of AVX2 FMAs:
	#define PAIR_SQUARE_4_AVX2(XtAr, XtBr, XtCr, XtDr, Xc0, Xs0, XuAr, XuBr, XuCr, XuDr, Xc1, Xs1, Xforth)\
	{\
	__asm__ volatile (\
		"movq	%[__tDr]	,%%rdx							\n\t	movq	%[__uDr]	,%%r13		\n\t"\
		"movq	%[__tAr]	,%%rax							\n\t	movq	%[__uAr]	,%%r10		\n\t"\
			"movq	%[__tCr]	,%%rcx							\n\t	movq	%[__uCr]	,%%r12		\n\t"\
			"movq	%[__tBr]	,%%rbx							\n\t	movq	%[__uBr]	,%%r11		\n\t"\
		"vmovaps	    (%%rdx),%%ymm4						\n\t	vmovaps	    (%%r13),%%ymm12		\n\t"\
		"vmovaps	0x20(%%rdx),%%ymm5						\n\t	vmovaps	0x20(%%r13),%%ymm13		\n\t"\
		"vshufpd	$5,%%ymm4,%%ymm4,%%ymm4					\n\t	vshufpd	$5,%%ymm12,%%ymm12,%%ymm12		\n\t"\
		"vshufpd	$5,%%ymm5,%%ymm5,%%ymm5					\n\t	vshufpd	$5,%%ymm13,%%ymm13,%%ymm13		\n\t"\
		"vmovaps	    (%%rax),%%ymm6						\n\t	vmovaps	    (%%r10),%%ymm14		\n\t"\
		"vmovaps	0x20(%%rax),%%ymm7						\n\t	vmovaps	0x20(%%r10),%%ymm15		\n\t"\
		"vmulpd				%%ymm6 ,%%ymm4,%%ymm0			\n\t	vmulpd				%%ymm14,%%ymm12,%%ymm8 	\n\t"\
		"vmulpd				%%ymm7 ,%%ymm4,%%ymm1			\n\t	vmulpd				%%ymm15,%%ymm12,%%ymm9 	\n\t"\
		" vfmadd231pd		%%ymm7 ,%%ymm5,%%ymm0			\n\t	 vfmadd231pd		%%ymm15,%%ymm13,%%ymm8 	\n\t"\
		"vfnmadd231pd		%%ymm6 ,%%ymm5,%%ymm1			\n\t	vfnmadd231pd		%%ymm14,%%ymm13,%%ymm9 	\n\t"\
		"vmovaps	    (%%rcx),%%ymm6						\n\t	vmovaps	    (%%r12),%%ymm14		\n\t"\
		"vmovaps	0x20(%%rcx),%%ymm7						\n\t	vmovaps	0x20(%%r12),%%ymm15		\n\t"\
		"vshufpd	$5,%%ymm6,%%ymm6,%%ymm6					\n\t	vshufpd	$5,%%ymm14,%%ymm14,%%ymm14		\n\t"\
		"vshufpd	$5,%%ymm7,%%ymm7,%%ymm7					\n\t	vshufpd	$5,%%ymm15,%%ymm15,%%ymm15		\n\t"\
		"vmovaps	    (%%rbx),%%ymm4						\n\t	vmovaps	    (%%r11),%%ymm12		\n\t"\
		"vmovaps	0x20(%%rbx),%%ymm5						\n\t	vmovaps	0x20(%%r11),%%ymm13		\n\t"\
		"vmulpd				%%ymm4 ,%%ymm6,%%ymm2			\n\t	vmulpd				%%ymm12,%%ymm14,%%ymm10	\n\t"\
		"vmulpd				%%ymm5 ,%%ymm6,%%ymm3			\n\t	vmulpd				%%ymm13,%%ymm14,%%ymm11	\n\t"\
		" vfmadd231pd		%%ymm5 ,%%ymm7,%%ymm2			\n\t	 vfmadd231pd		%%ymm13,%%ymm15,%%ymm10	\n\t"\
		"vfnmadd231pd		%%ymm4 ,%%ymm7,%%ymm3			\n\t	vfnmadd231pd		%%ymm12,%%ymm15,%%ymm11	\n\t"\
		"\n\t"\
	"movq		%[__forth],%%rdi	\n\t"\
	"leaq	-0x20(%%rdi),%%rdi		\n\t"/* two */\
	"vmovaps	%%ymm11,0xc0(%%rdi)	\n\t"/* Spill ymm11 datum in advance of batch-multiply below in which */\
		/* we use that reg to hold common multiplier ... slot 5 reg-widths above forth available for spills */\
		"vmovaps	    (%%rax)	,%%ymm4			\n\t	vmovaps	    (%%rbx)	,%%ymm6					\n\t	vmovaps	    (%%r10)	,%%ymm12			\n\t	vmovaps	    (%%r11)	,%%ymm14			\n\t"\
		"vmovaps	0x20(%%rax)	,%%ymm5			\n\t	vmovaps	0x20(%%rbx)	,%%ymm7					\n\t	vmovaps	0x20(%%r10)	,%%ymm13			\n\t	vmovaps	0x20(%%r11)	,%%ymm15			\n\t"\
		"vmulpd		%%ymm4,%%ymm4,%%ymm4		\n\t	vmulpd			%%ymm6,%%ymm6,%%ymm6		\n\t	vmulpd		%%ymm12,%%ymm12,%%ymm12		\n\t	vmulpd			%%ymm14,%%ymm14,%%ymm14	\n\t"\
		/* x^2 - y^2: */\
		"vfnmadd231pd	%%ymm5,%%ymm5,%%ymm4	\n\t	vfnmadd231pd	%%ymm7,%%ymm7,%%ymm6		\n\t	vfnmadd231pd	%%ymm13,%%ymm13,%%ymm12	\n\t	vfnmadd231pd	%%ymm15,%%ymm15,%%ymm14	\n\t"\
		"vmovaps	    (%%rax)	,%%ymm5			\n\t	vmovaps	    (%%rbx)	,%%ymm7					\n\t	vmovaps	    (%%r10)	,%%ymm13			\n\t	vmovaps	    (%%r11)	,%%ymm15			\n\t"\
		"vmulpd		0x20(%%rax)	,%%ymm5,%%ymm5	\n\t	vmulpd		0x20(%%rbx)	,%%ymm7,%%ymm7		\n\t	vmulpd		0x20(%%r10)	,%%ymm13,%%ymm13	\n\t	vmulpd		0x20(%%r11)	,%%ymm15,%%ymm15	\n\t"\
		"vaddpd		%%ymm5		,%%ymm5,%%ymm5	\n\t	vaddpd		%%ymm7		,%%ymm7,%%ymm7		\n\t	vaddpd		%%ymm13		,%%ymm13,%%ymm13	\n\t	vaddpd		%%ymm15		,%%ymm15,%%ymm15	\n\t"\
		"vmovaps	%%ymm4	,    (%%rax)		\n\t	vmovaps	%%ymm6	,    (%%rbx)				\n\t	vmovaps	%%ymm12	,    (%%r10)		\n\t	vmovaps	%%ymm14	,    (%%r11)		\n\t"\
		"vmovaps	%%ymm5	,0x20(%%rax)		\n\t	vmovaps	%%ymm7	,0x20(%%rbx)				\n\t	vmovaps	%%ymm13	,0x20(%%r10)		\n\t	vmovaps	%%ymm15	,0x20(%%r11)		\n\t"\
	"vmovaps	(%%rdi),%%ymm11	\n\t"/* the common multiplier. */\
		" vfmsub132pd	%%ymm11,%%ymm4,%%ymm0	\n\t	 vfmsub132pd	%%ymm11,%%ymm6,%%ymm2		\n\t	 vfmsub132pd	%%ymm11,%%ymm12,%%ymm8 	\n\t	 vfmsub132pd	%%ymm11,%%ymm14,%%ymm10	\n\t"\
		" vfmsub132pd	%%ymm11,%%ymm5,%%ymm1	\n\t	 vfmsub132pd	%%ymm11,%%ymm7,%%ymm3		\n\t	 vfmsub132pd	%%ymm11,%%ymm13,%%ymm9 	\n\t	 vfmsub132pd 0xc0(%%rdi),%%ymm15,%%ymm11	\n\t"/* Restore spilled ymm11-datum via mem-multiplicand */\
		"										\n\t"\
		"vmovaps	    (%%rdx)	,%%ymm4			\n\t	vmovaps	    (%%rcx)	,%%ymm6					\n\t	vmovaps	    (%%r13)	,%%ymm12			\n\t	vmovaps	    (%%r12)	,%%ymm14			\n\t"\
		"vmovaps	0x20(%%rdx)	,%%ymm5			\n\t	vmovaps	0x20(%%rcx)	,%%ymm7					\n\t	vmovaps	0x20(%%r13)	,%%ymm13			\n\t	vmovaps	0x20(%%r12)	,%%ymm15			\n\t"\
		"vmulpd			%%ymm4,%%ymm4,%%ymm4	\n\t	vmulpd			%%ymm6,%%ymm6,%%ymm6		\n\t	vmulpd			%%ymm12,%%ymm12,%%ymm12	\n\t	vmulpd			%%ymm14,%%ymm14,%%ymm14	\n\t"\
		"vfnmadd231pd	%%ymm5,%%ymm5,%%ymm4	\n\t	vfnmadd231pd	%%ymm7,%%ymm7,%%ymm6		\n\t	vfnmadd231pd	%%ymm13,%%ymm13,%%ymm12	\n\t	vfnmadd231pd	%%ymm15,%%ymm15,%%ymm14	\n\t"\
		"vmovaps	    (%%rdx)	,%%ymm5			\n\t	vmovaps	    (%%rcx)	,%%ymm7					\n\t	vmovaps	    (%%r13)	,%%ymm13			\n\t	vmovaps	    (%%r12)	,%%ymm15			\n\t"\
		"vmulpd		0x20(%%rdx)	,%%ymm5,%%ymm5	\n\t	vmulpd		0x20(%%rcx)	,%%ymm7,%%ymm7		\n\t	vmulpd		0x20(%%r13)	,%%ymm13,%%ymm13	\n\t	vmulpd		0x20(%%r12)	,%%ymm15,%%ymm15	\n\t"\
		"vaddpd		%%ymm5		,%%ymm5,%%ymm5	\n\t	vaddpd		%%ymm7		,%%ymm7,%%ymm7		\n\t	vaddpd		%%ymm13		,%%ymm13,%%ymm13	\n\t	vaddpd		%%ymm15		,%%ymm15,%%ymm15	\n\t"\
		"vmovaps	%%ymm4	,    (%%rdx)		\n\t	vmovaps	%%ymm6	,    (%%rcx)				\n\t	vmovaps	%%ymm12	,    (%%r13)		\n\t	vmovaps	%%ymm14	,    (%%r12)		\n\t"\
		"vmovaps	%%ymm5	,0x20(%%rdx)		\n\t	vmovaps	%%ymm7	,0x20(%%rcx)				\n\t	vmovaps	%%ymm13	,0x20(%%r13)		\n\t	vmovaps	%%ymm15	,0x20(%%r12)		\n\t"\
		"vshufpd	$5,%%ymm4,%%ymm4,%%ymm4		\n\t	vshufpd	$5,%%ymm6,%%ymm6,%%ymm6				\n\t	vshufpd	$5,%%ymm12,%%ymm12,%%ymm12		\n\t	vshufpd	$5,%%ymm14,%%ymm14,%%ymm14		\n\t"\
		"vshufpd	$5,%%ymm5,%%ymm5,%%ymm5		\n\t	vshufpd	$5,%%ymm7,%%ymm7,%%ymm7				\n\t	vshufpd	$5,%%ymm13,%%ymm13,%%ymm13		\n\t	vshufpd	$5,%%ymm15,%%ymm15,%%ymm15		\n\t"\
		"vsubpd	%%ymm4,%%ymm0,%%ymm0			\n\t	vsubpd	%%ymm6,%%ymm2,%%ymm2				\n\t	vsubpd	%%ymm12,%%ymm8 ,%%ymm8 			\n\t	vsubpd	%%ymm14,%%ymm10,%%ymm10			\n\t"\
		"vaddpd	%%ymm5,%%ymm1,%%ymm1			\n\t	vaddpd	%%ymm7,%%ymm3,%%ymm3				\n\t	vaddpd	%%ymm13,%%ymm9 ,%%ymm9 			\n\t	vaddpd	%%ymm15,%%ymm11,%%ymm11			\n\t"\
		"\n\t"\
		"movq	%[__c0]		,%%rax				\n\t	movq	%[__c1]		,%%r10		\n\t"\
		"movq	%[__s0]		,%%rbx				\n\t	movq	%[__s1]		,%%r11		\n\t"\
		"vmovaps	    (%%rax),%%ymm6			\n\t	vmovaps	    (%%r10),%%ymm14		\n\t"\
		"vmovaps		(%%rbx),%%ymm7			\n\t	vmovaps		(%%r11),%%ymm15		\n\t"\
	"leaq	0x20(%%rdi),%%rdi	\n\t"/* forth, from two */\
		"vmovaps	%%ymm0		,%%ymm4			\n\t	vmovaps	%%ymm8 		,%%ymm12		\n\t"\
		"vmovaps	%%ymm1		,%%ymm5			\n\t	vmovaps	%%ymm9 		,%%ymm13		\n\t"\
		" vfmadd132pd	%%ymm6 ,%%ymm4,%%ymm0	\n\t	 vfmadd132pd	%%ymm14,%%ymm12,%%ymm8 		\n\t"\
		" vfmadd132pd	%%ymm6 ,%%ymm5,%%ymm1	\n\t	 vfmadd132pd	%%ymm14,%%ymm13,%%ymm9 		\n\t"\
		"vfnmadd231pd	%%ymm7 ,%%ymm5,%%ymm0	\n\t	vfnmadd231pd	%%ymm15,%%ymm13,%%ymm8 		\n\t"\
		" vfmadd231pd	%%ymm7 ,%%ymm4,%%ymm1	\n\t	 vfmadd231pd	%%ymm15,%%ymm12,%%ymm9 		\n\t"\
		"vmovaps	%%ymm2	,%%ymm4				\n\t	vmovaps	%%ymm10	,%%ymm12		\n\t"\
		"vmovaps	%%ymm3	,%%ymm5				\n\t	vmovaps	%%ymm11	,%%ymm13		\n\t"\
		" vfmsub132pd	%%ymm7 ,%%ymm4,	%%ymm2	\n\t	 vfmsub132pd	%%ymm15,%%ymm12,%%ymm10		\n\t"\
		" vfmsub132pd	%%ymm7 ,%%ymm5,	%%ymm3	\n\t	 vfmsub132pd	%%ymm15,%%ymm13,%%ymm11		\n\t"\
		" vfmadd231pd	%%ymm6 ,%%ymm5,	%%ymm2	\n\t	 vfmadd231pd	%%ymm14,%%ymm13,%%ymm10		\n\t"\
		"vfnmadd231pd	%%ymm6 ,%%ymm4,	%%ymm3	\n\t	vfnmadd231pd	%%ymm14,%%ymm12,%%ymm11		\n\t"\
		"vmovaps	(%%rdi),%%ymm4	\n\t"/* 0.25 */\
		"vmulpd	%%ymm4,%%ymm0,%%ymm0						\n\t	vmulpd	%%ymm4,%%ymm8 ,%%ymm8 		\n\t"\
		"vmulpd	%%ymm4,%%ymm1,%%ymm1						\n\t	vmulpd	%%ymm4,%%ymm9 ,%%ymm9 		\n\t"\
		"vmulpd	%%ymm4,%%ymm2,%%ymm2						\n\t	vmulpd	%%ymm4,%%ymm10,%%ymm10		\n\t"\
		"vmulpd	%%ymm4,%%ymm3,%%ymm3						\n\t	vmulpd	%%ymm4,%%ymm11,%%ymm11		\n\t"\
		"\n\t"\
		"movq	%[__tAr]	,%%rax							\n\t	movq	%[__uAr]	,%%r10		\n\t"\
		"movq	%[__tBr]	,%%rbx							\n\t	movq	%[__uBr]	,%%r11		\n\t"\
		"\n\t"\
		"vmovaps	    (%%rax)	,%%ymm4						\n\t	vmovaps	    (%%r10)	,%%ymm12		\n\t"\
		"vmovaps	0x20(%%rax)	,%%ymm5						\n\t	vmovaps	0x20(%%r10)	,%%ymm13		\n\t"\
		"vmovaps	    (%%rbx)	,%%ymm6						\n\t	vmovaps	    (%%r11)	,%%ymm14		\n\t"\
		"vmovaps	0x20(%%rbx)	,%%ymm7						\n\t	vmovaps	0x20(%%r11)	,%%ymm15		\n\t"\
		"vaddpd	%%ymm0,%%ymm4,%%ymm4						\n\t	vaddpd	%%ymm8 ,%%ymm12,%%ymm12		\n\t"\
		"vaddpd	%%ymm1,%%ymm5,%%ymm5						\n\t	vaddpd	%%ymm9 ,%%ymm13,%%ymm13		\n\t"\
		"vsubpd	%%ymm2,%%ymm6,%%ymm6						\n\t	vsubpd	%%ymm10,%%ymm14,%%ymm14		\n\t"\
		"vsubpd	%%ymm3,%%ymm7,%%ymm7						\n\t	vsubpd	%%ymm11,%%ymm15,%%ymm15		\n\t"\
		"vmovaps	%%ymm4	,    (%%rax)					\n\t	vmovaps	%%ymm12	,    (%%r10)	\n\t"\
		"vmovaps	%%ymm5	,0x20(%%rax)					\n\t	vmovaps	%%ymm13	,0x20(%%r10)	\n\t"\
		"vmovaps	%%ymm6	,    (%%rbx)					\n\t	vmovaps	%%ymm14	,    (%%r11)	\n\t"\
		"vmovaps	%%ymm7	,0x20(%%rbx)					\n\t	vmovaps	%%ymm15	,0x20(%%r11)	\n\t"\
		"\n\t"\
		"movq	%[__tCr]	,%%rcx							\n\t	movq	%[__uCr]	,%%r12		\n\t"\
		"movq	%[__tDr]	,%%rdx							\n\t	movq	%[__uDr]	,%%r13		\n\t"\
		"\n\t"\
		"vshufpd	$5,%%ymm0,%%ymm0,%%ymm0					\n\t	vshufpd	$5,%%ymm8 ,%%ymm8 ,%%ymm8 		\n\t"\
		"vshufpd	$5,%%ymm1,%%ymm1,%%ymm1					\n\t	vshufpd	$5,%%ymm9 ,%%ymm9 ,%%ymm9 		\n\t"\
		"vshufpd	$5,%%ymm2,%%ymm2,%%ymm2					\n\t	vshufpd	$5,%%ymm10,%%ymm10,%%ymm10		\n\t"\
		"vshufpd	$5,%%ymm3,%%ymm3,%%ymm3					\n\t	vshufpd	$5,%%ymm11,%%ymm11,%%ymm11		\n\t"\
		"\n\t"\
		"vmovaps	    (%%rdx)	,%%ymm4						\n\t	vmovaps	    (%%r13)	,%%ymm12		\n\t"\
		"vmovaps	0x20(%%rdx)	,%%ymm5						\n\t	vmovaps	0x20(%%r13)	,%%ymm13		\n\t"\
		"vmovaps	    (%%rcx)	,%%ymm6						\n\t	vmovaps	    (%%r12)	,%%ymm14		\n\t"\
		"vmovaps	0x20(%%rcx)	,%%ymm7						\n\t	vmovaps	0x20(%%r12)	,%%ymm15		\n\t"\
		"vaddpd	%%ymm0,%%ymm4,%%ymm4						\n\t	vaddpd	%%ymm8 ,%%ymm12,%%ymm12		\n\t"\
		"vsubpd	%%ymm1,%%ymm5,%%ymm5						\n\t	vsubpd	%%ymm9 ,%%ymm13,%%ymm13		\n\t"\
		"vsubpd	%%ymm2,%%ymm6,%%ymm6						\n\t	vsubpd	%%ymm10,%%ymm14,%%ymm14		\n\t"\
		"vaddpd	%%ymm3,%%ymm7,%%ymm7						\n\t	vaddpd	%%ymm11,%%ymm15,%%ymm15		\n\t"\
		"vmovaps	%%ymm4	,    (%%rdx)					\n\t	vmovaps	%%ymm12	,    (%%r13)	\n\t"\
		"vmovaps	%%ymm5	,0x20(%%rdx)					\n\t	vmovaps	%%ymm13	,0x20(%%r13)	\n\t"\
		"vmovaps	%%ymm6	,    (%%rcx)					\n\t	vmovaps	%%ymm14	,    (%%r12)	\n\t"\
		"vmovaps	%%ymm7	,0x20(%%rcx)					\n\t	vmovaps	%%ymm15	,0x20(%%r12)	\n\t"\
		:					/* outputs: none */\
		: [__tAr] "m" (XtAr)	/* All inputs from memory addresses here */\
		 ,[__tBr] "m" (XtBr)\
		 ,[__tCr] "m" (XtCr)\
		 ,[__tDr] "m" (XtDr)\
		 ,[__c0] "m" (Xc0)\
		 ,[__s0] "m" (Xs0)\
		 ,[__uAr] "m" (XuAr)\
		 ,[__uBr] "m" (XuBr)\
		 ,[__uCr] "m" (XuCr)\
		 ,[__uDr] "m" (XuDr)\
		 ,[__c1] "m" (Xc1)\
		 ,[__s1] "m" (Xs1)\
		 ,[__forth] "m" (Xforth)\
		: "cc","memory","rax","rbx","rcx","rdx","rdi","r10","r11","r12","r13","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15"	/* Clobbered registers */\
	);\
	}

	// AVX version has shufpd immediate = 5 = 0101_2, which is the doubled analog of the SSE2 imm8 = 1 = 01_2:
	#define PAIR_SQUARE_4_SSE2(XtAr, XtBr, XtCr, XtDr, Xc, Xs, Xforth)\
	{\
	__asm__ volatile (\
		"movq	%[__tDr]	,%%rdx		\n\t"\
		"movq	%[__tAr]	,%%rax		\n\t"\
			"movq	%[__tCr]	,%%rcx		\n\t"\
			"movq	%[__tBr]	,%%rbx		\n\t"\
	/* Processing of data in regs 0145, 2367 independent -
	   overlap to mitigate latencies: */\
		"vmovaps	    (%%rdx),%%ymm4		\n\t"\
		"vmovaps	0x20(%%rdx),%%ymm5		\n\t"\
		"vshufpd	$5,%%ymm4,%%ymm4,%%ymm4		\n\t"\
		"vshufpd	$5,%%ymm5,%%ymm5,%%ymm5		\n\t"\
		"vmovaps	    (%%rcx),%%ymm6		\n\t"\
		"vmovaps	0x20(%%rcx),%%ymm7		\n\t"\
		"vshufpd	$5,%%ymm6,%%ymm6,%%ymm6		\n\t"\
		"vshufpd	$5,%%ymm7,%%ymm7,%%ymm7		\n\t"\
	/* Overlap of the 2 seqs is why we don't store rax/rabx data in xmm-registers here - already used all available xmm-regs: */\
		"vmulpd			    (%%rax),%%ymm4,%%ymm0	\n\t"\
		"vmulpd			0x20(%%rax),%%ymm4,%%ymm1	\n\t"\
		"vmulpd			    (%%rbx),%%ymm6,%%ymm2	\n\t"\
		"vmulpd			0x20(%%rbx),%%ymm6,%%ymm3	\n\t"\
		" vfmadd231pd	0x20(%%rax),%%ymm5,%%ymm0	\n\t"\
		"vfnmadd231pd	    (%%rax),%%ymm5,%%ymm1	\n\t"\
		" vfmadd231pd	0x20(%%rbx),%%ymm7,%%ymm2	\n\t"\
		"vfnmadd231pd	    (%%rbx),%%ymm7,%%ymm3	\n\t"\
		"\n\t"\
	/* now calculate square terms and __store back in the same temporaries: */\
	"movq		%[__forth],%%rdi		\n\t"\
	"leaq	-0x20(%%rdi),%%rdi		\n\t"/* two */\
	/*	lcol: __tmp=(__tAr+__tAi)*(__tAr-__tAi); __tAi=__tAr*__tAi; __tAi=__tAi+__tAi; __tAr=__tmp;	*/\
	/*	rcol: __tmp=(__tBr+__tBi)*(__tBr-__tBi); __tBi=__tBr*__tBi; __tBi=__tBi+__tBi; __tBr=__tmp;	*/\
		"vmovaps	    (%%rax)	,%%ymm4			\n\t	vmovaps	    (%%rbx)	,%%ymm6			\n\t"\
		"vmovaps	0x20(%%rax)	,%%ymm5			\n\t	vmovaps	0x20(%%rbx)	,%%ymm7			\n\t"\
		"vmulpd		%%ymm4,%%ymm4,%%ymm4		\n\t	vmulpd			%%ymm6,%%ymm6,%%ymm6	\n\t"\
		/* x^2 - y^2: */\
		"vfnmadd231pd	%%ymm5,%%ymm5,%%ymm4	\n\t	vfnmadd231pd	%%ymm7,%%ymm7,%%ymm6	\n\t"\
		"vmovaps	    (%%rax)	,%%ymm5			\n\t	vmovaps	    (%%rbx)	,%%ymm7			\n\t"\
		"vmulpd		0x20(%%rax)	,%%ymm5,%%ymm5	\n\t	vmulpd		0x20(%%rbx)	,%%ymm7,%%ymm7	\n\t"\
		"vaddpd		%%ymm5		,%%ymm5,%%ymm5	\n\t	vaddpd		%%ymm7		,%%ymm7,%%ymm7	\n\t"\
		"vmovaps	%%ymm4	,    (%%rax)		\n\t	vmovaps	%%ymm6	,    (%%rbx)		\n\t"\
		"vmovaps	%%ymm5	,0x20(%%rax)		\n\t	vmovaps	%%ymm7	,0x20(%%rbx)		\n\t"\
		" vfmsub132pd	(%%rdi),%%ymm4,%%ymm0	\n\t	 vfmsub132pd	(%%rdi),%%ymm6,%%ymm2	\n\t"\
		" vfmsub132pd	(%%rdi),%%ymm5,%%ymm1	\n\t	 vfmsub132pd	(%%rdi),%%ymm7,%%ymm3	\n\t"\
		"\n\t"\
	/*	lcol: __tmp=(__tDr+__tDi)*(__tDr-__tDi); __tDi=__tDr*__tDi; __tDi=__tDi+__tDi; __tDr=__tmp;	*/\
	/*	rcol: __tmp=(__tCr+__tCi)*(__tCr-__tCi); __tCi=__tCr*__tCi; __tCi=__tCi+__tCi; __tCr=__tmp;	*/\
		"vmovaps	    (%%rdx)	,%%ymm4			\n\t	vmovaps	    (%%rcx)	,%%ymm6			\n\t"\
		"vmovaps	0x20(%%rdx)	,%%ymm5			\n\t	vmovaps	0x20(%%rcx)	,%%ymm7			\n\t"\
		"vmulpd			%%ymm4,%%ymm4,%%ymm4	\n\t	vmulpd			%%ymm6,%%ymm6,%%ymm6	\n\t"\
		"vfnmadd231pd	%%ymm5,%%ymm5,%%ymm4	\n\t	vfnmadd231pd	%%ymm7,%%ymm7,%%ymm6	\n\t"\
		"vmovaps	    (%%rdx)	,%%ymm5			\n\t	vmovaps	    (%%rcx)	,%%ymm7			\n\t"\
		"vmulpd		0x20(%%rdx)	,%%ymm5,%%ymm5	\n\t	vmulpd		0x20(%%rcx)	,%%ymm7,%%ymm7	\n\t"\
		"vaddpd		%%ymm5		,%%ymm5,%%ymm5	\n\t	vaddpd		%%ymm7		,%%ymm7,%%ymm7	\n\t"\
		"vmovaps	%%ymm4	,    (%%rdx)		\n\t	vmovaps	%%ymm6	,    (%%rcx)		\n\t"\
		"vmovaps	%%ymm5	,0x20(%%rdx)		\n\t	vmovaps	%%ymm7	,0x20(%%rcx)		\n\t"\
		"vshufpd	$5,%%ymm4,%%ymm4,%%ymm4		\n\t	vshufpd	$5,%%ymm6,%%ymm6,%%ymm6		\n\t"\
		"vshufpd	$5,%%ymm5,%%ymm5,%%ymm5		\n\t	vshufpd	$5,%%ymm7,%%ymm7,%%ymm7		\n\t"\
		"vsubpd	%%ymm4,%%ymm0,%%ymm0			\n\t	vsubpd	%%ymm6,%%ymm2,%%ymm2			\n\t"\
		"vaddpd	%%ymm5,%%ymm1,%%ymm1			\n\t	vaddpd	%%ymm7,%%ymm3,%%ymm3			\n\t"\
		"\n\t"\
		"movq	%[__c]		,%%rax		\n\t"\
		"movq	%[__s]		,%%rbx		\n\t"\
	"leaq	0x20(%%rdi),%%rdi		\n\t"/* forth, from two */\
		"vmovaps	%%ymm0		,%%ymm4		\n\t"\
		"vmovaps	%%ymm1		,%%ymm5		\n\t"\
		"vmovaps	%%ymm2	,%%ymm6		\n\t"\
		"vmovaps	%%ymm3	,%%ymm7		\n\t"\
		" vfmadd132pd	(%%rax)	,%%ymm4,%%ymm0		\n\t"\
		" vfmadd132pd	(%%rax)	,%%ymm5,%%ymm1		\n\t"\
		" vfmsub132pd	(%%rbx)	,%%ymm6,	%%ymm2	\n\t"\
		" vfmsub132pd	(%%rbx)	,%%ymm7,	%%ymm3	\n\t"\
		"vfnmadd231pd	(%%rbx)	,%%ymm5,%%ymm0		\n\t"\
		" vfmadd231pd	(%%rbx)	,%%ymm4,%%ymm1		\n\t"\
		" vfmadd231pd	(%%rax)	,%%ymm7,	%%ymm2	\n\t"\
		"vfnmadd231pd	(%%rax)	,%%ymm6,	%%ymm3	\n\t"\
		"vmovaps	(%%rdi),%%ymm4		\n\t"/* 0.25 */\
		"vmulpd	%%ymm4,%%ymm0,%%ymm0		\n\t"\
		"vmulpd	%%ymm4,%%ymm1,%%ymm1		\n\t"\
		"vmulpd	%%ymm4,%%ymm2,%%ymm2		\n\t"\
		"vmulpd	%%ymm4,%%ymm3,%%ymm3		\n\t"\
		"\n\t"\
		"movq	%[__tAr]	,%%rax		\n\t"\
		"movq	%[__tBr]	,%%rbx		\n\t"\
		"\n\t"\
		"vmovaps	    (%%rax)	,%%ymm4		\n\t"\
		"vmovaps	0x20(%%rax)	,%%ymm5		\n\t"\
		"vmovaps	    (%%rbx)	,%%ymm6		\n\t"\
		"vmovaps	0x20(%%rbx)	,%%ymm7		\n\t"\
		"vaddpd	%%ymm0,%%ymm4,%%ymm4		\n\t"\
		"vaddpd	%%ymm1,%%ymm5,%%ymm5		\n\t"\
		"vsubpd	%%ymm2,%%ymm6,%%ymm6		\n\t"\
		"vsubpd	%%ymm3,%%ymm7,%%ymm7		\n\t"\
		"vmovaps	%%ymm4	,    (%%rax)	\n\t"\
		"vmovaps	%%ymm5	,0x20(%%rax)	\n\t"\
		"vmovaps	%%ymm6	,    (%%rbx)	\n\t"\
		"vmovaps	%%ymm7	,0x20(%%rbx)	\n\t"\
		"\n\t"\
		"movq	%[__tCr]	,%%rcx		\n\t"\
		"movq	%[__tDr]	,%%rdx		\n\t"\
		"\n\t"\
		"vshufpd	$5,%%ymm0,%%ymm0,%%ymm0		\n\t"\
		"vshufpd	$5,%%ymm1,%%ymm1,%%ymm1		\n\t"\
		"vshufpd	$5,%%ymm2,%%ymm2,%%ymm2		\n\t"\
		"vshufpd	$5,%%ymm3,%%ymm3,%%ymm3		\n\t"\
		"\n\t"\
		"vmovaps	    (%%rdx)	,%%ymm4		\n\t"\
		"vmovaps	0x20(%%rdx)	,%%ymm5		\n\t"\
		"vmovaps	    (%%rcx)	,%%ymm6		\n\t"\
		"vmovaps	0x20(%%rcx)	,%%ymm7		\n\t"\
		"vaddpd	%%ymm0,%%ymm4,%%ymm4		\n\t"\
		"vsubpd	%%ymm1,%%ymm5,%%ymm5		\n\t"\
		"vsubpd	%%ymm2,%%ymm6,%%ymm6		\n\t"\
		"vaddpd	%%ymm3,%%ymm7,%%ymm7		\n\t"\
		"vmovaps	%%ymm4	,    (%%rdx)	\n\t"\
		"vmovaps	%%ymm5	,0x20(%%rdx)	\n\t"\
		"vmovaps	%%ymm6	,    (%%rcx)	\n\t"\
		"vmovaps	%%ymm7	,0x20(%%rcx)	\n\t"\
		:					/* outputs: none */\
		: [__tAr] "m" (XtAr)	/* All inputs from memory addresses here */\
		 ,[__tBr] "m" (XtBr)\
		 ,[__tCr] "m" (XtCr)\
		 ,[__tDr] "m" (XtDr)\
		 ,[__c] "m" (Xc)\
		 ,[__s] "m" (Xs)\
		 ,[__forth] "m" (Xforth)\
		: "cc","memory","rax","rbx","rcx","rdx","rdi","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7"		/* Clobbered registers */\
	);\
	}

	// Sep 2019: 2-input FFT(a)*FFT(b) version of above PAIR_SQUARE_4_SSE2 macro, based on above ARM SIMD version of PAIR_MUL_4_SSE2.
	// NOTE: Unlike the PAIR_SQUARE_4 version of this macro, the MUL version assumes the sincos terms premultiplied by 1/4!
	// AVX version has shufpd immediate = 0x5 = 0101_2, which is the doubled analog of the SSE2 imm8 = 1 = 01_2:
	#define PAIR_MUL_4_SSE2(XA0,XA1,XA2,XA3, XB0,XB1,XB2,XB3, Xc,Xs,Xforth)\
	{\
	__asm__ volatile (\
		/* Load a2,a3 and b2,b3, d0,d1-swap, then compute
			t0 = ~a3r*~b3r - ~a3i*~b3i, t2 = ~a3r*~b3i + ~a3i*~b3r
			t1 = ~a2r*~b2r - ~a2i*~b2i, t3 = ~a2r*~b2i + ~a2i*~b2r
		*/\
		"movq	%[__A2]	,%%rcx	\n\t"\
		"movq	%[__A3]	,%%rdx	\n\t"\
		"movq	%[__B2]	,%%rdi	\n\t"\
		"movq	%[__B3]	,%%rsi	\n\t"\
		/* Must load double-pairs-to-be-swapped into regs first, since SHUFPD takes low double from DEST and high from SRC: */\
		"vmovaps	    (%%rcx),%%ymm0		\n\t	vshufpd	$5,%%ymm0,%%ymm0,%%ymm0	\n\t"/* ~a2r */\
		"vmovaps	0x20(%%rcx),%%ymm1		\n\t	vshufpd	$5,%%ymm1,%%ymm1,%%ymm1	\n\t"/* ~a2i */\
		"vmovaps	    (%%rdi),%%ymm4		\n\t	vshufpd	$5,%%ymm4,%%ymm4,%%ymm4	\n\t"/* ~b2r */\
		"vmovaps	0x20(%%rdi),%%ymm5		\n\t	vshufpd	$5,%%ymm5,%%ymm5,%%ymm5	\n\t"/* ~b2i */\
		"vmovaps	    (%%rdx),%%ymm2		\n\t	vshufpd	$5,%%ymm2,%%ymm2,%%ymm2	\n\t"/* ~a3r */\
		"vmovaps	0x20(%%rdx),%%ymm3		\n\t	vshufpd	$5,%%ymm3,%%ymm3,%%ymm3	\n\t"/* ~a3i */\
		"vmovaps	    (%%rsi),%%ymm6		\n\t	vshufpd	$5,%%ymm6,%%ymm6,%%ymm6	\n\t"/* ~b3r */\
		"vmovaps	0x20(%%rsi),%%ymm7		\n\t	vshufpd	$5,%%ymm7,%%ymm7,%%ymm7	\n\t"/* ~b3i */\
		"vmulpd		%%ymm0	,%%ymm4	,%%ymm8	\n\t"/* ~a2r*~b2r */\
		"vmulpd		%%ymm0	,%%ymm5	,%%ymm9	\n\t"/* ~a2r*~b2i */\
		"vmulpd		%%ymm2	,%%ymm6	,%%ymm10\n\t"/* ~a3r*~b3r */\
		"vmulpd		%%ymm2	,%%ymm7	,%%ymm11\n\t"/* ~a3r*~b3i */\
	"vfnmadd231pd	%%ymm1	,%%ymm5	,%%ymm8	\n\t"/* t1 = ~a2r*~b2r - ~a2i*~b2i */\
	"vfmadd231pd	%%ymm1	,%%ymm4	,%%ymm9	\n\t"/* t3 = ~a2r*~b2i + ~a2i*~b2r */\
	"vfnmadd231pd	%%ymm3	,%%ymm7	,%%ymm10\n\t"/* t0 = ~a3r*~b3r - ~a3i*~b3i */\
	"vfmadd231pd	%%ymm3	,%%ymm6	,%%ymm11\n\t"/* t2 = ~a3r*~b3i + ~a3i*~b3r */\
		/* t1,3 and t0,2 not needed until final butterfly sequence, so write back to A2,3 memlocs: */\
		"vmovaps	%%ymm8	,    (%%rcx)	\n\t	movq	%[__A0]	,%%rax	\n\t"\
		"vmovaps	%%ymm9	,0x20(%%rcx)	\n\t	movq	%[__A1]	,%%rbx	\n\t"\
		"vmovaps	%%ymm10	,    (%%rdx)	\n\t	movq	%[__B0]	,%%rdi	\n\t"\
		"vmovaps	%%ymm11	,0x20(%%rdx)	\n\t	movq	%[__B1]	,%%rsi	\n\t"\
	/* a2,3 in ymm0-3, b2,3 in ymm4-7, t1,3 in (rcx), t0,2 in (rdx) */\
		/* calculate difference terms...these need the [a,b][2|3] vector-data to be d0,1-swapped:
			~a3r -= a0r, ~a3i += a0i,
			~a2r -= a1r, ~a2i += a1i, similar for b-data, but move ~b2 -+ b1 down to just before a1*b1 cmul to free up 2 regs.
		*/\
/*** Need ~a3r = a0r - ~a3r, not ~a3r -= a0r! [Similar for a2r,b3r,b2r] ***
************** As currently, a2r,a3r,b2r,b3r all negated! ****************/\
		"vmovaps	    (%%rax)	,%%ymm8		\n\t	vsubpd	%%ymm8	,%%ymm2	,%%ymm2	\n\t"/* ~a3r -= a0r */\
		"vmovaps	0x20(%%rax)	,%%ymm9		\n\t	vaddpd	%%ymm9	,%%ymm3	,%%ymm3	\n\t"/* ~a3i += a0i */\
		"vmovaps	    (%%rbx)	,%%ymm10	\n\t	vsubpd	%%ymm10	,%%ymm0	,%%ymm0	\n\t"/* ~a2r -= a1r */\
		"vmovaps	0x20(%%rbx)	,%%ymm11	\n\t	vaddpd	%%ymm11	,%%ymm1	,%%ymm1	\n\t"/* ~a2i += a1i */\
		"vmovaps	    (%%rdi)	,%%ymm12	\n\t	vsubpd	%%ymm12	,%%ymm6	,%%ymm6	\n\t"/* ~b3r -= b0r */\
		"vmovaps	0x20(%%rdi)	,%%ymm13	\n\t	vaddpd	%%ymm13	,%%ymm7	,%%ymm7	\n\t"/* ~b3i += b0i */\
		"vmovaps	    (%%rsi)	,%%ymm14	\n\t	vsubpd	%%ymm14	,%%ymm4	,%%ymm4	\n\t"/* ~b2r -= b1r */\
		"vmovaps	0x20(%%rsi)	,%%ymm15	\n\t	vaddpd	%%ymm15	,%%ymm5	,%%ymm5	\n\t"/* ~b2i += b1i */\
		/* now calculate 1st square-like term and store back in H(j) slot:
			t4 = a0r*b0r - a0i*b0i, a0i = a0r*b0i + a0i*b0r, a0r = t4
			t5 = a1r*b1r - a1i*b1i, a1i = a1r*b1i + a1i*b1r, a1r = t5
		*/\
		"vmulpd		    (%%rax)	,%%ymm12,%%ymm8	\n\t"/* a0r*b0r */\
		"vmulpd		    (%%rax)	,%%ymm13,%%ymm9	\n\t"/* a0r*b0i */\
		"vmulpd		    (%%rbx)	,%%ymm14,%%ymm10\n\t"/* a1r*b1r */\
		"vmulpd		    (%%rbx)	,%%ymm15,%%ymm11\n\t"/* a1r*b1i */\
	"vfnmadd231pd	0x20(%%rax)	,%%ymm13,%%ymm8	\n\t"/* a0r' = a0r*b0r - a0i*b0i */\
	"vfmadd231pd	0x20(%%rax)	,%%ymm12,%%ymm9	\n\t"/* a0i' = a0r*b0i + a0i*b0r */\
	"vfnmadd231pd	0x20(%%rbx)	,%%ymm15,%%ymm10\n\t"/* a1r' = a1r*b1r - a1i*b1i */\
	"vfmadd231pd	0x20(%%rbx)	,%%ymm14,%%ymm11\n\t"/* a1i' = a1r*b1i + a1i*b1r */\
	/* a0,1 in ymm8-11, a2,3 in ymm0-3, b2,3 in ymm4-7, t1,3 in (rcx), t0,2 in (rdx) */\
		/* calculate the complex products to build the second term:
			t4 = ~a3r*~b3r - ~a3i*~b3i, ~a3i = ~a3r*~b3i + ~a3i*~b3r, ~a3r,i in ymm2,3, ~b3r,i in ymm6,7
			t5 = ~a2r*~b2r - ~a2i*~b2i, ~a2i = ~a2r*~b2i + ~a2i*~b2r, ~arr,i in ymm0,1, ~b2r,i in ymm4,5
		*/\
/****************** a2r,a3r,b2r,b3r being negated means a2i,a3i come out negated ****************/\
		"vmulpd		%%ymm0	,%%ymm4	,%%ymm12\n\t"/* ~a2r*~b2r */\
		"vmulpd		%%ymm0	,%%ymm5	,%%ymm13\n\t"/* ~a2r*~b2i */\
		"vmulpd		%%ymm2	,%%ymm6	,%%ymm14\n\t"/* ~a3r*~b3r */\
		"vmulpd		%%ymm2	,%%ymm7	,%%ymm15\n\t"/* ~a3r*~b3i */\
	"vfnmadd231pd	%%ymm1	,%%ymm5	,%%ymm12\n\t"/* t5   = ~a2r*~b2r - ~a2i*~b2i */\
	"vfmadd231pd	%%ymm1	,%%ymm4	,%%ymm13\n\t"/* ~a2i = ~a2r*~b2i + ~a2i*~b2r */\
	"vfnmadd231pd	%%ymm3	,%%ymm7	,%%ymm14\n\t"/* t4   = ~a3r*~b3r - ~a3i*~b3i */\
	"vfmadd231pd	%%ymm3	,%%ymm6	,%%ymm15\n\t"/* ~a3i = ~a3r*~b3i + ~a3i*~b3r */\
		/* ymm0-7 free */\
		/* Assume [c0,s1],[s0,c1] sincos vector-data are in the [c] and [s]-input-pointers, then compute
			~a3r = [cc+0.25]*t4 - [ss]*~a3i, ~a3i = [ss]*t4 + [cc+0.25]*~a3i
			~a2r = [0.25-ss]*t5 - [cc]*~a2i, ~a2i = [cc]*t5 + [0.25-ss]*~a2i ,
		where cc = 0.25*[c0,s1] and ss = 0.25*[s0,c1]:
		*/\
/****************** a2i,a3i being negated requires +- sign swap in this next computation ****************/\
		"movq	%[__forth],%%rdi		\n\t	vmovaps	(%%rdi),%%ymm6		\n\t	vmovaps	%%ymm6,%%ymm7	\n\t"/* 2 copies of 0.25 */\
		"movq	%[__c]	,%%rdi			\n\t	vmovaps	(%%rdi),%%ymm4		\n\t"/*	cc assumed premultiplied by 0.25 */\
		"movq	%[__s]	,%%rsi			\n\t	vmovaps	(%%rsi),%%ymm5		\n\t"/*	ss assumed premultiplied by 0.25 */\
		"vaddpd	%%ymm4	,%%ymm6	,%%ymm6	\n\t	vsubpd	%%ymm5	,%%ymm7	,%%ymm7	\n\t"	/* [cc+0.25],[0.25-ss] in ymm6,7 */\
		"vmulpd		%%ymm14	,%%ymm6	,%%ymm2	\n\t"/*   t4*[cc+0.25] */\
		"vmulpd		%%ymm14	,%%ymm5	,%%ymm3	\n\t"/*   t4*[ss] */\
		"vmulpd		%%ymm12	,%%ymm7	,%%ymm0	\n\t"/*   t5*[0.25-ss] */\
		"vmulpd		%%ymm12	,%%ymm4	,%%ymm1	\n\t"/*   t5*[cc] */\
	"vfmadd231pd	%%ymm15	,%%ymm5	,%%ymm2	\n\t"/* ~a3r = [cc+0.25]*t4 - [ss]*~a3i in ymm2 */\
	"vfnmadd231pd	%%ymm15	,%%ymm6	,%%ymm3	\n\t"/* ~a3i = [cc+0.25]*~a3i - [ss]*t4 in ymm3 */\
	"vfmadd231pd	%%ymm13	,%%ymm4	,%%ymm0	\n\t"/* ~a2r = [0.25-ss]*t5 - [cc]*~a2i in ymm0 */\
	"vfnmadd231pd	%%ymm13	,%%ymm7	,%%ymm1	\n\t"/* ~a2i = [0.25-ss]*~a2i - [cc]*t5 in ymm1 */\
/****************** a2i,a3i in ymm1,3; *NOT* negated as in the sse2|avx case ****************/\
	/* a0,1 in ymm8-11, a2,3 in ymm0-3, t1,3 in (rcx), t0,2 in (rdx) */\
		"vmovaps	    (%%rdx)	,%%ymm4		\n\t"/* t0 */\
		"vmovaps	0x20(%%rdx)	,%%ymm5		\n\t"/* t2 */\
		"vmovaps	    (%%rcx)	,%%ymm6		\n\t"/* t1 */\
		"vmovaps	0x20(%%rcx)	,%%ymm7		\n\t"/* t3 */\
	/* and now complete and store the results:
		a0r -= ~a3r, a0i -= ~a3i
		a1r -= ~a2r, a1i -= ~a2i
	N-j terms:
		~a3r = t0 - ~a3r, ~a3i += t2
		~a2r = t1 - ~a2r, ~a2i += t3
	*/\
/****************** a2i,a3i negated means in rcol instead computing a0,1i += ~a3,2i, a3,2i = t2,3 - a3,2i ****************/\
		"vsubpd	%%ymm2	,%%ymm8	,%%ymm8	\n\t	vsubpd	%%ymm3	,%%ymm9	,%%ymm9	\n\t"	/* a0r,i in v8 ,9 ; ~a3r,i in v2,3 */\
		"vsubpd	%%ymm0	,%%ymm10,%%ymm10\n\t	vsubpd	%%ymm1	,%%ymm11,%%ymm11\n\t"	/* a1r,i in v10,11; ~a2r,i in v0,1 */\
		"vsubpd	%%ymm2	,%%ymm4	,%%ymm4	\n\t	vaddpd	%%ymm3	,%%ymm5	,%%ymm5	\n\t"	/* t0,2 in v4,5 */\
		"vsubpd	%%ymm0	,%%ymm6	,%%ymm6	\n\t	vaddpd	%%ymm1	,%%ymm7	,%%ymm7	\n\t"	/* t1,3 in v6,7 */\
	/* Interleave writes of a0,a1 with un-shufflings of ~a2,~a3: */\
		"vmovaps	%%ymm8	,    (%%rax)	\n\t	vshufpd	$5	,%%ymm4	,%%ymm4,%%ymm4	\n\t"/* ~a3r */\
		"vmovaps	%%ymm9	,0x20(%%rax)	\n\t	vshufpd	$5	,%%ymm5	,%%ymm5,%%ymm5	\n\t"/* ~a3i */\
		"vmovaps	%%ymm10	,    (%%rbx)	\n\t	vshufpd	$5	,%%ymm6	,%%ymm6,%%ymm6	\n\t"/* ~a2r */\
		"vmovaps	%%ymm11	,0x20(%%rbx)	\n\t	vshufpd	$5	,%%ymm7	,%%ymm7,%%ymm7	\n\t"/* ~a2i */\
		"vmovaps	%%ymm4	,    (%%rdx)	\n\t"\
		"vmovaps	%%ymm5	,0x20(%%rdx)	\n\t"\
		"vmovaps	%%ymm6	,    (%%rcx)	\n\t"\
		"vmovaps	%%ymm7	,0x20(%%rcx)	\n\t"\
		/* Cost (FMA = MUL): [43 vector-load/store (8 implicit), 12 shufpd, 18 addpd, 32 mulpd, 1 vector-register-copy] */\
		:					/* outputs: none */\
		: [__A0] "m" (XA0)	/* All inputs from memory addresses here */\
		 ,[__A1] "m" (XA1)\
		 ,[__A2] "m" (XA2)\
		 ,[__A3] "m" (XA3)\
		 ,[__B0] "m" (XB0)\
		 ,[__B1] "m" (XB1)\
		 ,[__B2] "m" (XB2)\
		 ,[__B3] "m" (XB3)\
		 ,[__c] "m" (Xc)\
		 ,[__s] "m" (Xs)\
		 ,[__forth] "m" (Xforth)\
		: "cc","memory","rax","rbx","rcx","rdx","rdi","rsi","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15"	/* Clobbered registers */\
	);\
	}

	//...Radix-7 DFT: Inputs in memlocs __i0-6, outputs into __o0-6, possibly coincident with inputs.

   #ifdef ALL_FMA	// FMAs used for all arithmetic, including 'trivial' ones (one mult = 1.0) to replace ADD/SUB:

	// Aggressive-FMA: replace [6 ADD, 12 SUB, 6 MUL, 42 FMA, 58 memref] ==> [6 MUL, 60 FMA (42 nontrivial), 66 memref].
	// I.e. trade 18 ADD/SUB for same #FMA + 8 LOAD-of-vector-const-1.0, those where spill of a reg-datum in favor of 1.0 not justifiable.
	//
	#define SSE2_RADIX_07_DFT(Xi0,Xi1,Xi2,Xi3,Xi4,Xi5,Xi6, Xcc,Xtwo, Xo0,Xo1,Xo2,Xo3,Xo4,Xo5,Xo6)\
	{\
	__asm__ volatile (\
		"movq	%[__two],%%r8		\n\t	leaq	0x20(%%r8),%%r9		\n\t"/* two,one */\
		"movq	%[__i1],%%rax		\n\t"\
		"movq	%[__i2],%%rbx		\n\t"\
		"movq	%[__i3],%%rcx		\n\t"\
		"movq	%[__i4],%%rdx		\n\t"\
		"movq	%[__i5],%%rsi		\n\t"\
		"movq	%[__i6],%%rdi		\n\t"		/*** Rcol does Imaginary Parts: ***/\
		"vmovaps	(%%rax),%%ymm6		\n\t	vmovaps	0x20(%%rax),%%ymm14	\n\t"	/* x1 */\
		"vmovaps	(%%rdi),%%ymm1		\n\t	vmovaps	0x20(%%rdi),%%ymm9 	\n\t"	/* x6 */\
		"vmovaps	(%%rbx),%%ymm5		\n\t	vmovaps	0x20(%%rbx),%%ymm13	\n\t"	/* x2 */\
		"vmovaps	(%%rsi),%%ymm2		\n\t	vmovaps	0x20(%%rsi),%%ymm10	\n\t"	/* x5 */\
		"vmovaps	(%%rcx),%%ymm4		\n\t	vmovaps	0x20(%%rcx),%%ymm12	\n\t"	/* x3 */\
		"vmovaps	(%%rdx),%%ymm3		\n\t	vmovaps	0x20(%%rdx),%%ymm11	\n\t"	/* x4 */\
	"vmovaps	(%%r8),%%ymm0		\n\t	vmovaps	(%%r9),%%ymm7		\n\t"/* two,one */\
		"movq	%[__i0],%%rdi		\n\t"\
	"vfmsub132pd %%ymm7,%%ymm1,%%ymm6	\n\t vfmsub132pd %%ymm7,%%ymm9 ,%%ymm14	\n\t"	/* t6 = x1 - x6 */\
	"vfmsub132pd %%ymm7,%%ymm2,%%ymm5	\n\t vfmsub132pd %%ymm7,%%ymm10,%%ymm13	\n\t"	/* t5 = x2 - x5 */\
	"vfmsub132pd %%ymm7,%%ymm3,%%ymm4	\n\t vfmsub132pd %%ymm7,%%ymm11,%%ymm12	\n\t"	/* t4 = x3 - x4 */\
	"vfmadd132pd %%ymm0,%%ymm6,%%ymm1	\n\t vfmadd132pd %%ymm0,%%ymm14,%%ymm9 	\n\t"	/* t1 = x1 + x6 */\
	"vfmadd132pd %%ymm0,%%ymm5,%%ymm2	\n\t vfmadd132pd %%ymm0,%%ymm13,%%ymm10	\n\t"	/* t2 = x2 + x5 */\
	"vfmadd132pd %%ymm0,%%ymm4,%%ymm3	\n\t vfmadd132pd %%ymm0,%%ymm12,%%ymm11	\n\t"	/* t3 = x3 + x4 */\
		"vmovaps	(%%rdi),%%ymm0		\n\t	vmovaps	0x20(%%rdi),%%ymm8 	\n\t"	/* t0 = x0 */\
		"movq	%[__o1],%%rax		\n\t"\
		"movq	%[__o2],%%rbx		\n\t"\
		"movq	%[__o3],%%rcx		\n\t"\
	/* Spill  xi - xj combos to o-slots; these won't be needed until we get to the sine terms: */\
		"vmovaps	%%ymm6,    (%%rax)	\n\t	vmovaps	%%ymm14,0x20(%%rax)	\n\t"/* t6 */\
		"vmovaps	%%ymm5,    (%%rbx)	\n\t	vmovaps	%%ymm13,0x20(%%rbx)	\n\t"/* t5 */\
		"vmovaps	%%ymm4,    (%%rcx)	\n\t	vmovaps	%%ymm12,0x20(%%rcx)	\n\t"/* t4 */\
		"vmovaps	%%ymm0,%%ymm6		\n\t	vmovaps	%%ymm8 ,%%ymm14		\n\t"/* Br0 = t0 (only show real parts in comments) */\
		"vmovaps	%%ymm0,%%ymm5		\n\t	vmovaps	%%ymm8 ,%%ymm13		\n\t"/* rt  = t0 */\
		"vmovaps	%%ymm0,%%ymm4		\n\t	vmovaps	%%ymm8 ,%%ymm12		\n\t"/* re  = t0 */\
\
		"movq	%[__cc],%%rsi		\n\t"\
		"movq	%[__o0],%%rdi		\n\t"\
		"vmovaps	0x40(%%rsi),%%ymm7		\n\t	vmovaps	0x80(%%rsi),%%ymm15		\n\t"/* cc2,cc3 */\
	"vfmadd231pd (%%rsi),%%ymm1,%%ymm5	\n\t vfmadd231pd (%%rsi),%%ymm9 ,%%ymm13	\n\t"/* rt  = FMADD(cc1,tr1, rt ); */\
	"vfmadd231pd %%ymm7 ,%%ymm1,%%ymm4	\n\t vfmadd231pd %%ymm7 ,%%ymm9 ,%%ymm12	\n\t"/* re  = FMADD(cc2,tr1, re ); */\
	"vfmadd231pd %%ymm15,%%ymm1,%%ymm0	\n\t vfmadd231pd %%ymm15,%%ymm9 ,%%ymm8 	\n\t"/* tr0 = FMADD(cc3,tr1, tr0); */\
	"vfmadd132pd (%%r9),%%ymm1,%%ymm6	\n\t vfmadd132pd (%%r9),%%ymm9 ,%%ymm14		\n\t"/* Br0 += tr1; */\
\
	"vfmadd231pd %%ymm7 ,%%ymm2,%%ymm5	\n\t vfmadd231pd %%ymm7 ,%%ymm10,%%ymm13	\n\t"/* rt  = FMADD(cc2,tr2, rt ); */\
	"vfmadd231pd %%ymm15,%%ymm2,%%ymm4	\n\t vfmadd231pd %%ymm15,%%ymm10,%%ymm12	\n\t"/* re  = FMADD(cc3,tr2, re ); */\
	"vfmadd231pd (%%rsi),%%ymm2,%%ymm0	\n\t vfmadd231pd (%%rsi),%%ymm10,%%ymm8 	\n\t"/* tr0 = FMADD(cc1,tr2, tr0); */\
	"vfmadd132pd (%%r9),%%ymm2,%%ymm6	\n\t vfmadd132pd (%%r9),%%ymm10,%%ymm14		\n\t"/* Br0 += tr2; */\
\
	"vfmadd231pd %%ymm15,%%ymm3,%%ymm5	\n\t vfmadd231pd %%ymm15,%%ymm11,%%ymm13	\n\t"/* rt  = FMADD(cc3,tr3, rt ); */\
	"vfmadd231pd (%%rsi),%%ymm3,%%ymm4	\n\t vfmadd231pd (%%rsi),%%ymm11,%%ymm12	\n\t"/* re  = FMADD(cc1,tr3, re ); */\
	"vfmadd231pd %%ymm7 ,%%ymm3,%%ymm0	\n\t vfmadd231pd %%ymm7 ,%%ymm11,%%ymm8 	\n\t"/* tr0 = FMADD(cc2,tr3, tr0); */\
	"vfmadd132pd (%%r9),%%ymm3,%%ymm6	\n\t vfmadd132pd (%%r9),%%ymm11,%%ymm14		\n\t"/* Br0 += tr3; */\
		"vmovaps	%%ymm6,    (%%rdi)	\n\t	vmovaps	%%ymm14,0x20(%%rdi)	\n\t"/* B0 */\
\
		"addq	$0x20,%%rsi		\n\t"/* Incr trig ptr: cc0 -> ss0 */\
		"vmovaps	0x40(%%rsi),%%ymm7	\n\t	vmovaps	0x80(%%rsi),%%ymm15		\n\t"/* ss2,ss3 */\
		"vmovaps		(%%rax),%%ymm1	\n\t	vmovaps	0x20(%%rax),%%ymm9 		\n\t"/* Restore: tr1 = tr6 */\
		"vmovaps		%%ymm1 ,%%ymm2	\n\t	vmovaps		%%ymm9 ,%%ymm10		\n\t"/* tr2 = tr6 */\
		"vmovaps		%%ymm1 ,%%ymm3	\n\t	vmovaps		%%ymm9 ,%%ymm11		\n\t"/* tr3 = tr6 */\
		"vmulpd	(%%rsi),%%ymm1,%%ymm1	\n\t	vmulpd (%%rsi),%%ymm9 ,%%ymm9 	\n\t"/* tr1 = ss1*tr6; */\
		"vmulpd	%%ymm7 ,%%ymm2,%%ymm2	\n\t	vmulpd %%ymm7 ,%%ymm10,%%ymm10	\n\t"/* tr2 = ss2*tr6; */\
		"vmulpd	%%ymm15,%%ymm3,%%ymm3	\n\t	vmulpd %%ymm15,%%ymm11,%%ymm11	\n\t"/* tr3 = ss3*tr6; */\
\
		"vmovaps		(%%rbx),%%ymm6	\n\t	vmovaps	0x20(%%rbx),%%ymm14			\n\t"/* Restore t5 */\
	" vfmadd231pd %%ymm7 ,%%ymm6,%%ymm1	\n\t  vfmadd231pd %%ymm7 ,%%ymm14,%%ymm9 	\n\t"/* tr1 =  FMADD(ss2,tr5, tr1); */\
	"vfnmadd231pd %%ymm15,%%ymm6,%%ymm2	\n\t vfnmadd231pd %%ymm15,%%ymm14,%%ymm10	\n\t"/* tr2 = FNMADD(ss3,tr5, tr2); */\
	"vfnmadd231pd (%%rsi),%%ymm6,%%ymm3	\n\t vfnmadd231pd (%%rsi),%%ymm14,%%ymm11	\n\t"/* tr3 = FNMADD(ss1,tr5, tr3); */\
\
		"vmovaps		(%%rcx),%%ymm6	\n\t	vmovaps	0x20(%%rcx),%%ymm14			\n\t"/* Restore t4 */\
	" vfmadd231pd %%ymm15,%%ymm6,%%ymm1	\n\t  vfmadd231pd %%ymm15,%%ymm14,%%ymm9 	\n\t"/* tr1 =  FMADD(ss3,tr4, tr1); */\
	"vfnmadd231pd (%%rsi),%%ymm6,%%ymm2	\n\t vfnmadd231pd (%%rsi),%%ymm14,%%ymm10	\n\t"/* tr2 = FNMADD(ss1,tr4, tr2); */\
	" vfmadd231pd %%ymm7 ,%%ymm6,%%ymm3	\n\t  vfmadd231pd %%ymm7 ,%%ymm14,%%ymm11	\n\t"/* tr3 =  FMADD(ss2,tr4, tr3); */\
\
		"\n\t"\
		"movq	%[__o4],%%rdx		\n\t"\
		"movq	%[__o5],%%rsi		\n\t"\
		"movq	%[__o6],%%rdi		\n\t"\
	"vmovaps	(%%r8),%%ymm6		\n\t	vmovaps	(%%r9),%%ymm7		\n\t"/* two,one */\
	/* Output permutation causes signs to get flipped here: */\
	"vfmsub132pd %%ymm7,%%ymm9 ,%%ymm5		\n\t vfmsub132pd %%ymm7,%%ymm1 ,%%ymm13	\n\t"/* Br1 = rt  - ti1;	Bi6 = it  - tr1; */\
	"vfmsub132pd %%ymm7,%%ymm10,%%ymm4		\n\t vfmsub132pd %%ymm7,%%ymm2 ,%%ymm12	\n\t"/* Br2 = re  - ti2;	Bi5 = im  - tr2; */\
	"vfmsub132pd %%ymm7,%%ymm11,%%ymm0		\n\t vfmsub132pd %%ymm7,%%ymm3 ,%%ymm8 	\n\t"/* Br3 = tr0 - ti3;	Bi4 = ti0 - tr3; */\
	"vfmadd132pd %%ymm6,%%ymm5,%%ymm9 		\n\t vfmadd132pd %%ymm6,%%ymm13,%%ymm1 	\n\t"/* Br6 = rt  + ti1;	Bi1 = it  + tr1; */\
	"vfmadd132pd %%ymm6,%%ymm4,%%ymm10		\n\t vfmadd132pd %%ymm6,%%ymm12,%%ymm2 	\n\t"/* Br5 = re  + ti2;	Bi2 = im  + tr2; */\
	"vfmadd132pd %%ymm6,%%ymm0,%%ymm11		\n\t vfmadd132pd %%ymm6,%%ymm8 ,%%ymm3 	\n\t"/* Br4 = tr0 + ti3;	Bi3 = ti0 + tr3; */\
		"vmovaps	%%ymm5	,   (%%rax)		\n\t	vmovaps	%%ymm13,0x20(%%rdi)	\n\t"/* Br1,Bi6 */\
		"vmovaps	%%ymm4	,   (%%rbx)		\n\t	vmovaps	%%ymm12,0x20(%%rsi)	\n\t"/* Br2,Bi5 */\
		"vmovaps	%%ymm0	,   (%%rcx)		\n\t	vmovaps	%%ymm8 ,0x20(%%rdx)	\n\t"/* Br3,Bi4 */\
		"vmovaps	%%ymm9 	,   (%%rdi)		\n\t	vmovaps	%%ymm1 ,0x20(%%rax)	\n\t"/* Br6,Bi1 */\
		"vmovaps	%%ymm10	,   (%%rsi)		\n\t	vmovaps	%%ymm2 ,0x20(%%rbx)	\n\t"/* Br5,Bi2 */\
		"vmovaps	%%ymm11	,   (%%rdx)		\n\t	vmovaps	%%ymm3 ,0x20(%%rcx)	\n\t"/* Br4,Bi3 */\
		:					/* outputs: none */\
		: [__i0] "m" (Xi0)	/* All inputs from memory addresses here */\
		 ,[__i1] "m" (Xi1)\
		 ,[__i2] "m" (Xi2)\
		 ,[__i3] "m" (Xi3)\
		 ,[__i4] "m" (Xi4)\
		 ,[__i5] "m" (Xi5)\
		 ,[__i6] "m" (Xi6)\
		 ,[__cc] "m" (Xcc)\
		 ,[__two] "m" (Xtwo)\
		 ,[__o0] "m" (Xo0)\
		 ,[__o1] "m" (Xo1)\
		 ,[__o2] "m" (Xo2)\
		 ,[__o3] "m" (Xo3)\
		 ,[__o4] "m" (Xo4)\
		 ,[__o5] "m" (Xo5)\
		 ,[__o6] "m" (Xo6)\
		: "cc","memory","rax","rbx","rcx","rdx","rsi","rdi","r8","r9","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15"		/* Clobbered registers */\
	);\
	}

   #else	// ALL_FMA = False: FMAs used only for nontrivial MUL/ADD combos:

	// AVX -> FMA version: replace [88 ADD, 16 MUL, 54 memref] ==> [18 ADD, 6 MUL, 42 FMA, 58 memref].
	// I.e. trade [88 ADD, 10 MUL] for 42 FMA. FMA version also better at preserving floating-point accuracy.
	//
	#define SSE2_RADIX_07_DFT(Xi0,Xi1,Xi2,Xi3,Xi4,Xi5,Xi6, Xcc,Xtwo, Xo0,Xo1,Xo2,Xo3,Xo4,Xo5,Xo6)\
	{\
	__asm__ volatile (\
		"movq	%[__two],%%r8		\n\t"\
		"movq	%[__i1],%%rax		\n\t"\
		"movq	%[__i2],%%rbx		\n\t"\
		"movq	%[__i3],%%rcx		\n\t"\
		"movq	%[__i4],%%rdx		\n\t"\
		"movq	%[__i5],%%rsi		\n\t"\
		"movq	%[__i6],%%rdi		\n\t"		/*** Rcol does Imaginary Parts: ***/\
		"vmovaps	(%%rax),%%ymm6		\n\t	vmovaps	0x20(%%rax),%%ymm14	\n\t"	/* x1 */\
		"vmovaps	(%%rdi),%%ymm1		\n\t	vmovaps	0x20(%%rdi),%%ymm9 	\n\t"	/* x6 */\
		"vmovaps	(%%rbx),%%ymm5		\n\t	vmovaps	0x20(%%rbx),%%ymm13	\n\t"	/* x2 */\
		"vmovaps	(%%rsi),%%ymm2		\n\t	vmovaps	0x20(%%rsi),%%ymm10	\n\t"	/* x5 */\
		"vmovaps	(%%rcx),%%ymm4		\n\t	vmovaps	0x20(%%rcx),%%ymm12	\n\t"	/* x3 */\
		"vmovaps	(%%rdx),%%ymm3		\n\t	vmovaps	0x20(%%rdx),%%ymm11	\n\t"	/* x4 */\
		"vmovaps	(%%r8),%%ymm0		\n\t"/* two */\
		"movq	%[__i0],%%rdi		\n\t"\
		"vsubpd	%%ymm1,%%ymm6,%%ymm6	\n\t	vsubpd	%%ymm9 ,%%ymm14,%%ymm14		\n\t"	/* t6 = x1 - x6 */\
		"vsubpd	%%ymm2,%%ymm5,%%ymm5	\n\t	vsubpd	%%ymm10,%%ymm13,%%ymm13		\n\t"	/* t5 = x2 - x5 */\
		"vsubpd	%%ymm3,%%ymm4,%%ymm4	\n\t	vsubpd	%%ymm11,%%ymm12,%%ymm12		\n\t"	/* t4 = x3 - x4 */\
	"vfmadd132pd %%ymm0,%%ymm6,%%ymm1	\n\t vfmadd132pd %%ymm0,%%ymm14,%%ymm9 		\n\t"	/* t1 = x1 + x6 */\
	"vfmadd132pd %%ymm0,%%ymm5,%%ymm2	\n\t vfmadd132pd %%ymm0,%%ymm13,%%ymm10		\n\t"	/* t2 = x2 + x5 */\
	"vfmadd132pd %%ymm0,%%ymm4,%%ymm3	\n\t vfmadd132pd %%ymm0,%%ymm12,%%ymm11		\n\t"	/* t3 = x3 + x4 */\
		"vmovaps	(%%rdi),%%ymm0		\n\t	vmovaps	0x20(%%rdi),%%ymm8 	\n\t"	/* t0 = x0 */\
		"movq	%[__o1],%%rax		\n\t"\
		"movq	%[__o2],%%rbx		\n\t"\
		"movq	%[__o3],%%rcx		\n\t"\
	/* Spill  xi - xj combos to o-slots; these won't be needed until we get to the sine terms: */\
		"vmovaps	%%ymm6,    (%%rax)	\n\t	vmovaps	%%ymm14,0x20(%%rax)	\n\t"/* t6 */\
		"vmovaps	%%ymm5,    (%%rbx)	\n\t	vmovaps	%%ymm13,0x20(%%rbx)	\n\t"/* t5 */\
		"vmovaps	%%ymm4,    (%%rcx)	\n\t	vmovaps	%%ymm12,0x20(%%rcx)	\n\t"/* t4 */\
		"vmovaps	%%ymm0,%%ymm6		\n\t	vmovaps	%%ymm8 ,%%ymm14		\n\t"/* Br0 = t0 (only show real parts in comments) */\
		"vmovaps	%%ymm0,%%ymm5		\n\t	vmovaps	%%ymm8 ,%%ymm13		\n\t"/* rt  = t0 */\
		"vmovaps	%%ymm0,%%ymm4		\n\t	vmovaps	%%ymm8 ,%%ymm12		\n\t"/* re  = t0 */\
\
		"movq	%[__cc],%%rsi		\n\t"\
		"movq	%[__o0],%%rdi		\n\t"\
		"vmovaps	0x40(%%rsi),%%ymm7	\n\t	vmovaps	0x80(%%rsi),%%ymm15			\n\t"/* cc2,cc3 */\
	"vfmadd231pd (%%rsi),%%ymm1,%%ymm5	\n\t vfmadd231pd (%%rsi),%%ymm9 ,%%ymm13	\n\t"/* rt  = FMADD(cc1,tr1, rt ); */\
	"vfmadd231pd %%ymm7 ,%%ymm1,%%ymm4	\n\t vfmadd231pd %%ymm7 ,%%ymm9 ,%%ymm12	\n\t"/* re  = FMADD(cc2,tr1, re ); */\
	"vfmadd231pd %%ymm15,%%ymm1,%%ymm0	\n\t vfmadd231pd %%ymm15,%%ymm9 ,%%ymm8 	\n\t"/* tr0 = FMADD(cc3,tr1, tr0); */\
		"vaddpd	%%ymm1,%%ymm6,%%ymm6	\n\t	vaddpd	%%ymm9 ,%%ymm14,%%ymm14		\n\t"/* Br0 += tr1; */\
\
	"vfmadd231pd %%ymm7 ,%%ymm2,%%ymm5	\n\t vfmadd231pd %%ymm7 ,%%ymm10,%%ymm13	\n\t"/* rt  = FMADD(cc2,tr2, rt ); */\
	"vfmadd231pd %%ymm15,%%ymm2,%%ymm4	\n\t vfmadd231pd %%ymm15,%%ymm10,%%ymm12	\n\t"/* re  = FMADD(cc3,tr2, re ); */\
	"vfmadd231pd (%%rsi),%%ymm2,%%ymm0	\n\t vfmadd231pd (%%rsi),%%ymm10,%%ymm8 	\n\t"/* tr0 = FMADD(cc1,tr2, tr0); */\
		"vaddpd	%%ymm2,%%ymm6,%%ymm6	\n\t	vaddpd	%%ymm10,%%ymm14,%%ymm14		\n\t"/* Br0 += tr2; */\
\
	"vfmadd231pd %%ymm15,%%ymm3,%%ymm5	\n\t vfmadd231pd %%ymm15,%%ymm11,%%ymm13	\n\t"/* rt  = FMADD(cc3,tr3, rt ); */\
	"vfmadd231pd (%%rsi),%%ymm3,%%ymm4	\n\t vfmadd231pd (%%rsi),%%ymm11,%%ymm12	\n\t"/* re  = FMADD(cc1,tr3, re ); */\
	"vfmadd231pd %%ymm7 ,%%ymm3,%%ymm0	\n\t vfmadd231pd %%ymm7 ,%%ymm11,%%ymm8 	\n\t"/* tr0 = FMADD(cc2,tr3, tr0); */\
		"vaddpd	%%ymm3,%%ymm6,%%ymm6	\n\t	vaddpd	%%ymm11,%%ymm14,%%ymm14		\n\t"/* Br0 += tr3; */\
		"vmovaps	%%ymm6,    (%%rdi)	\n\t	vmovaps	%%ymm14,0x20(%%rdi)	\n\t"/* B0 */\
\
		"addq	$0x20,%%rsi		\n\t"/* Incr trig ptr: cc0 -> ss0 */\
		"vmovaps	0x40(%%rsi),%%ymm7	\n\t	vmovaps	0x80(%%rsi),%%ymm15		\n\t"/* ss2,ss3 */\
		"vmovaps		(%%rax),%%ymm1	\n\t	vmovaps	0x20(%%rax),%%ymm9 		\n\t"/* Restore: tr1 = tr6 */\
		"vmovaps		%%ymm1 ,%%ymm2	\n\t	vmovaps		%%ymm9 ,%%ymm10		\n\t"/* tr2 = tr6 */\
		"vmovaps		%%ymm1 ,%%ymm3	\n\t	vmovaps		%%ymm9 ,%%ymm11		\n\t"/* tr3 = tr6 */\
		"vmulpd (%%rsi),%%ymm1,%%ymm1	\n\t	vmulpd (%%rsi),%%ymm9 ,%%ymm9 	\n\t"/* tr1 = ss1*tr6; */\
		"vmulpd %%ymm7 ,%%ymm2,%%ymm2	\n\t	vmulpd %%ymm7 ,%%ymm10,%%ymm10	\n\t"/* tr2 = ss2*tr6; */\
		"vmulpd %%ymm15,%%ymm3,%%ymm3	\n\t	vmulpd %%ymm15,%%ymm11,%%ymm11	\n\t"/* tr3 = ss3*tr6; */\
\
		"vmovaps		(%%rbx),%%ymm6	\n\t	vmovaps	0x20(%%rbx),%%ymm14			\n\t"/* Restore t5 */\
	" vfmadd231pd %%ymm7 ,%%ymm6,%%ymm1	\n\t  vfmadd231pd %%ymm7 ,%%ymm14,%%ymm9 	\n\t"/* tr1 =  FMADD(ss2,tr5, tr1); */\
	"vfnmadd231pd %%ymm15,%%ymm6,%%ymm2	\n\t vfnmadd231pd %%ymm15,%%ymm14,%%ymm10	\n\t"/* tr2 = FNMADD(ss3,tr5, tr2); */\
	"vfnmadd231pd (%%rsi),%%ymm6,%%ymm3	\n\t vfnmadd231pd (%%rsi),%%ymm14,%%ymm11	\n\t"/* tr3 = FNMADD(ss1,tr5, tr3); */\
\
		"vmovaps		(%%rcx),%%ymm6	\n\t	vmovaps	0x20(%%rcx),%%ymm14			\n\t"/* Restore t4 */\
	" vfmadd231pd %%ymm15,%%ymm6,%%ymm1	\n\t  vfmadd231pd %%ymm15,%%ymm14,%%ymm9 	\n\t"/* tr1 =  FMADD(ss3,tr4, tr1); */\
	"vfnmadd231pd (%%rsi),%%ymm6,%%ymm2	\n\t vfnmadd231pd (%%rsi),%%ymm14,%%ymm10	\n\t"/* tr2 = FNMADD(ss1,tr4, tr2); */\
	" vfmadd231pd %%ymm7 ,%%ymm6,%%ymm3	\n\t  vfmadd231pd %%ymm7 ,%%ymm14,%%ymm11	\n\t"/* tr3 =  FMADD(ss2,tr4, tr3); */\
\
		"\n\t"\
		"movq	%[__o4],%%rdx		\n\t"\
		"movq	%[__o5],%%rsi		\n\t"\
		"movq	%[__o6],%%rdi		\n\t"\
		"vmovaps	(%%r8),%%ymm6	\n\t"/* two */\
	/* Output permutation causes signs to get flipped here: */\
		"vsubpd	%%ymm9 ,%%ymm5,%%ymm5		\n\t	vsubpd	%%ymm1 ,%%ymm13,%%ymm13	\n\t"/* Br1 = rt  - ti1;	Bi6 = it  - tr1; */\
		"vsubpd	%%ymm10,%%ymm4,%%ymm4		\n\t	vsubpd	%%ymm2 ,%%ymm12,%%ymm12	\n\t"/* Br2 = re  - ti2;	Bi5 = im  - tr2; */\
		"vsubpd	%%ymm11,%%ymm0,%%ymm0		\n\t	vsubpd	%%ymm3 ,%%ymm8 ,%%ymm8 	\n\t"/* Br3 = tr0 - ti3;	Bi4 = ti0 - tr3; */\
	"vfmadd132pd %%ymm6,%%ymm5,%%ymm9 		\n\t vfmadd132pd %%ymm6,%%ymm13,%%ymm1 	\n\t"/* Br6 = rt  + ti1;	Bi1 = it  + tr1; */\
	"vfmadd132pd %%ymm6,%%ymm4,%%ymm10		\n\t vfmadd132pd %%ymm6,%%ymm12,%%ymm2 	\n\t"/* Br5 = re  + ti2;	Bi2 = im  + tr2; */\
	"vfmadd132pd %%ymm6,%%ymm0,%%ymm11		\n\t vfmadd132pd %%ymm6,%%ymm8 ,%%ymm3 	\n\t"/* Br4 = tr0 + ti3;	Bi3 = ti0 + tr3; */\
		"vmovaps	%%ymm5	,   (%%rax)		\n\t	vmovaps	%%ymm13,0x20(%%rdi)	\n\t"/* Br1,Bi6 */\
		"vmovaps	%%ymm4	,   (%%rbx)		\n\t	vmovaps	%%ymm12,0x20(%%rsi)	\n\t"/* Br2,Bi5 */\
		"vmovaps	%%ymm0	,   (%%rcx)		\n\t	vmovaps	%%ymm8 ,0x20(%%rdx)	\n\t"/* Br3,Bi4 */\
		"vmovaps	%%ymm9 	,   (%%rdi)		\n\t	vmovaps	%%ymm1 ,0x20(%%rax)	\n\t"/* Br6,Bi1 */\
		"vmovaps	%%ymm10	,   (%%rsi)		\n\t	vmovaps	%%ymm2 ,0x20(%%rbx)	\n\t"/* Br5,Bi2 */\
		"vmovaps	%%ymm11	,   (%%rdx)		\n\t	vmovaps	%%ymm3 ,0x20(%%rcx)	\n\t"/* Br4,Bi3 */\
		:					/* outputs: none */\
		: [__i0] "m" (Xi0)	/* All inputs from memory addresses here */\
		 ,[__i1] "m" (Xi1)\
		 ,[__i2] "m" (Xi2)\
		 ,[__i3] "m" (Xi3)\
		 ,[__i4] "m" (Xi4)\
		 ,[__i5] "m" (Xi5)\
		 ,[__i6] "m" (Xi6)\
		 ,[__cc] "m" (Xcc)\
		 ,[__two] "m" (Xtwo)\
		 ,[__o0] "m" (Xo0)\
		 ,[__o1] "m" (Xo1)\
		 ,[__o2] "m" (Xo2)\
		 ,[__o3] "m" (Xo3)\
		 ,[__o4] "m" (Xo4)\
		 ,[__o5] "m" (Xo5)\
		 ,[__o6] "m" (Xo6)\
		: "cc","memory","rax","rbx","rcx","rdx","rsi","rdi","r8","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15"		/* Clobbered registers */\
	);\
	}

   #endif	// ALL_FMA ?

	/* Twiddleless version of SSE2_RADIX8_DIF_TWIDDLE: one radix-8 decimation-in-frequency pass, AVX+FMA build.
	Inputs enter in memory locations __r0 + [__i1,__i2,__i3,__i4,__i5,__i6,__i7];
	where __r0 is a memory address and the __i's are LITERAL [BYTE] OFFSETS ('e'-constraints below).
	Outputs go into memory locations __o0,__o1,__o2,__o3,__o4,__o5,__o6,__o7, assumed disjoint with inputs.
	__isrt2 points to 1/sqrt(2); __two points to 2.0, and sqrt(2) is loaded from byte offset 0x40 past
	__two (cf. the 0x40(%%r9) load below) - NOTE(review): confirm that constant-block layout vs callers.
	GPR roles: lcol rax,rbx,rcx,rdx = inputs i0,i2,i4,i6 then outputs o0-o3; rcol r10-r13 = inputs
	i1,i3,i5,i7 then outputs o4-o7; rsi -> isrt2, r9 -> two. All 16 ymm registers are live, so selected
	ymm values are spilled to already-consumed I/O slots to free a register for the 2.0/sqrt2 constants.
	*/
	#define SSE2_RADIX8_DIF_0TWIDDLE(Xr0, Xi1,Xi2,Xi3,Xi4,Xi5,Xi6,Xi7, Xo0,Xo1,Xo2,Xo3,Xo4,Xo5,Xo6,Xo7, Xisrt2,Xtwo)\
	{\
	__asm__ volatile (\
	/* 1st of 2 radix-4 subtransforms, data in ymm0-7: */	/* 2nd of 2 radix-4 subtransforms, data in ymm8-15: */\
		"movq	%[__isrt2],%%rsi				\n\t		movq	%[__two],%%r9	\n\t"/* r9 holds 2.0 throughout */\
		"movq	%[__r0],%%rax	/* i0 = r00 */	\n\t		leaq	%c[__i1](%%rax),%%r10	/* i1 */\n\t"\
		"leaq	%c[__i2](%%rax),%%rbx			\n\t		leaq	%c[__i3](%%rax),%%r11	/* i3 */\n\t"\
		"leaq	%c[__i4](%%rax),%%rcx			\n\t		leaq	%c[__i5](%%rax),%%r12	/* i5 */\n\t"\
		"leaq	%c[__i6](%%rax),%%rdx			\n\t		leaq	%c[__i7](%%rax),%%r13	/* i7 */\n\t"\
	/* p0,4 combo: x+-y into ymm0/1, 2/3, resp: */		/* p1,5 combo: x+y into ymm8 /1, x-y in ymm10/3: */\
	/* p2,6 combo: x+-y into ymm4/5, 6/7, resp: */		/* p3,7 combo: x+y into ymm14/7, x-y in ymm12/5: */\
		"vmovaps	     (%%rcx),%%ymm0			\n\t		vmovaps	     (%%r12),%%ymm8 			\n\t"\
		"vmovaps	0x020(%%rcx),%%ymm1			\n\t		vmovaps	0x020(%%r12),%%ymm9 			\n\t"\
		"vmovaps	     (%%rax),%%ymm2			\n\t		vmovaps	     (%%r10),%%ymm10			\n\t"\
		"vmovaps	0x020(%%rax),%%ymm3			\n\t		vmovaps	0x020(%%r10),%%ymm11			\n\t"\
		"vmovaps	     (%%rdx),%%ymm4			\n\t		vmovaps	     (%%r11),%%ymm12			\n\t"\
		"vmovaps	0x020(%%rdx),%%ymm5			\n\t		vmovaps	0x020(%%r11),%%ymm13			\n\t"\
		"vmovaps	     (%%rbx),%%ymm6			\n\t		vmovaps	     (%%r13),%%ymm14			\n\t"\
		"vmovaps	0x020(%%rbx),%%ymm7			\n\t		vmovaps	0x020(%%r13),%%ymm15			\n\t"\
		"vsubpd	%%ymm0,%%ymm2,%%ymm2			\n\t		vsubpd	%%ymm8 ,%%ymm10,%%ymm10			\n\t"\
		"vsubpd	%%ymm1,%%ymm3,%%ymm3			\n\t		vsubpd	%%ymm9 ,%%ymm11,%%ymm11			\n\t"\
		"vsubpd	%%ymm4,%%ymm6,%%ymm6			\n\t		vsubpd	%%ymm14,%%ymm12,%%ymm12			\n\t"\
		"vsubpd	%%ymm5,%%ymm7,%%ymm7			\n\t		vsubpd	%%ymm15,%%ymm13,%%ymm13			\n\t"\
	"vmovaps	%%ymm15,(%%rax) 	\n\t"/* spill ymm15 to make room for 2.0 */"	vmovaps	 (%%r9),%%ymm15	\n\t"/* two */\
	"vfmadd132pd	%%ymm15,%%ymm2,%%ymm0		\n\t	vfmadd132pd	%%ymm15,%%ymm10,%%ymm8 	\n\t"\
	"vfmadd132pd	%%ymm15,%%ymm3,%%ymm1		\n\t	vfmadd132pd	%%ymm15,%%ymm11,%%ymm9 	\n\t"\
	"vfmadd132pd	%%ymm15,%%ymm6,%%ymm4		\n\t	vfmadd132pd	%%ymm15,%%ymm12,%%ymm14	\n\t"\
	"vfmadd132pd	%%ymm15,%%ymm7,%%ymm5		\n\t	vfmadd132pd	(%%rax),%%ymm13,%%ymm15	\n\t"\
		/* Finish radix-4 butterfly and store results into temporary-array slots: */\
		"vsubpd		%%ymm4,%%ymm0,%%ymm0		\n\t		vsubpd		%%ymm14,%%ymm8 ,%%ymm8 		\n\t"\
		"vsubpd		%%ymm5,%%ymm1,%%ymm1		\n\t		vsubpd		%%ymm15,%%ymm9 ,%%ymm9 		\n\t"\
		"vsubpd		%%ymm7,%%ymm2,%%ymm2		\n\t		vsubpd		%%ymm13,%%ymm10,%%ymm10		\n\t"\
		"vsubpd		%%ymm6,%%ymm3,%%ymm3		\n\t		vsubpd		%%ymm12,%%ymm11,%%ymm11		\n\t"\
	"vmovaps	%%ymm12,(%%rax) 	\n\t"/* spill ymm12 to make room for 2.0 */"	vmovaps	 (%%r9),%%ymm12	\n\t"/* two */\
	"vfmadd132pd	%%ymm12,%%ymm0,%%ymm4		\n\t	vfmadd132pd		%%ymm12,%%ymm8 ,%%ymm14		\n\t"\
	"vfmadd132pd	%%ymm12,%%ymm1,%%ymm5		\n\t	vfmadd132pd		%%ymm12,%%ymm9 ,%%ymm15		\n\t"\
	"vfmadd132pd	%%ymm12,%%ymm2,%%ymm7		\n\t	vfmadd132pd		%%ymm12,%%ymm10,%%ymm13		\n\t"\
	"vfmadd132pd	%%ymm12,%%ymm3,%%ymm6		\n\t	vfmadd132pd		(%%rax),%%ymm11,%%ymm12		\n\t"\
		"													vsubpd		%%ymm12,%%ymm10,%%ymm10		\n\t"\
		"													vsubpd		%%ymm11,%%ymm13,%%ymm13		\n\t"\
		"												vfmadd132pd		(%%r9 ),%%ymm10,%%ymm12		\n\t"/* .two */\
		"												vfmadd132pd		(%%r9 ),%%ymm13,%%ymm11		\n\t"\
		/* SSE2_RADIX8_DIF_COMBINE_RAD4_SUBS(r00,r10,r20,r30,r08,r18,r28,r38): */\
		"\n\t"\
		"movq	%[__o0],%%rax					\n\t		movq	%[__o4],%%r10				\n\t"\
		"movq	%[__o1],%%rbx					\n\t		movq	%[__o5],%%r11				\n\t"\
		"movq	%[__o2],%%rcx					\n\t		movq	%[__o6],%%r12				\n\t"\
		"movq	%[__o3],%%rdx					\n\t		movq	%[__o7],%%r13				\n\t"\
		/* Combine r00,r08,r20,r28: */						/* Combine r10,r18,r30,r38: */\
		"vsubpd		%%ymm14,%%ymm4 ,%%ymm4 		\n\t	vfnmadd231pd	(%%rsi),%%ymm10,%%ymm2 		\n\t"/* .isrt2 */\
		"vsubpd		%%ymm9 ,%%ymm0 ,%%ymm0 		\n\t	vfnmadd231pd	(%%rsi),%%ymm13,%%ymm3 		\n\t"\
		"vsubpd		%%ymm15,%%ymm5 ,%%ymm5 		\n\t	vfnmadd231pd	(%%rsi),%%ymm12,%%ymm6 		\n\t"\
		"vsubpd		%%ymm8 ,%%ymm1 ,%%ymm1 		\n\t	vfnmadd231pd	(%%rsi),%%ymm11,%%ymm7 		\n\t"\
	"vmovaps	%%ymm8 ,(%%rax) 	\n\t"/* spill ymm8  to make room for 2.0 */"	vmovaps	 (%%r9),%%ymm8 	\n\t"/* two */\
	"vmovaps	%%ymm11,(%%r10) 	\n\t"/* spill ymm11 to make room for sqrt2 */"	vmovaps	0x40(%%r9),%%ymm11 \n\t"/* sqrt2 */\
		"vmovaps	%%ymm4 ,    (%%rbx)			\n\t		vmovaps	%%ymm2 ,    (%%r11)			\n\t"\
		"vmovaps	%%ymm0 ,    (%%rcx)			\n\t		vmovaps	%%ymm3 ,0x20(%%r13)			\n\t"\
		"vmovaps	%%ymm5 ,0x20(%%rbx)			\n\t		vmovaps	%%ymm6 ,0x20(%%r11)			\n\t"\
		"vmovaps	%%ymm1 ,0x20(%%rdx)			\n\t		vmovaps	%%ymm7 ,    (%%r12)			\n\t"\
	"vfmadd132pd	%%ymm8 ,%%ymm4 ,%%ymm14		\n\t	vfmadd132pd		%%ymm11,%%ymm2 ,%%ymm10		\n\t"\
	"vfmadd132pd	%%ymm8 ,%%ymm0 ,%%ymm9 		\n\t	vfmadd132pd		%%ymm11,%%ymm3 ,%%ymm13		\n\t"\
	"vfmadd132pd	%%ymm8 ,%%ymm5 ,%%ymm15		\n\t	vfmadd132pd		%%ymm11,%%ymm6 ,%%ymm12		\n\t"\
	"vfmadd132pd	(%%rax),%%ymm1 ,%%ymm8 		\n\t	vfmadd132pd		(%%r10),%%ymm7 ,%%ymm11		\n\t"\
		"vmovaps	%%ymm14,    (%%rax)			\n\t		vmovaps	%%ymm10,    (%%r10)			\n\t"\
		"vmovaps	%%ymm9 ,    (%%rdx)			\n\t		vmovaps	%%ymm13,0x20(%%r12)			\n\t"\
		"vmovaps	%%ymm15,0x20(%%rax)			\n\t		vmovaps	%%ymm12,0x20(%%r10)			\n\t"\
		"vmovaps	%%ymm8 ,0x20(%%rcx)			\n\t		vmovaps	%%ymm11,    (%%r13)			\n\t"\
		:					/* outputs: none */\
		: [__r0] "m" (Xr0)	/* All inputs from memory addresses here */\
		 ,[__i1] "e" (Xi1)\
		 ,[__i2] "e" (Xi2)\
		 ,[__i3] "e" (Xi3)\
		 ,[__i4] "e" (Xi4)\
		 ,[__i5] "e" (Xi5)\
		 ,[__i6] "e" (Xi6)\
		 ,[__i7] "e" (Xi7)\
		 ,[__o0] "m" (Xo0)\
		 ,[__o1] "m" (Xo1)\
		 ,[__o2] "m" (Xo2)\
		 ,[__o3] "m" (Xo3)\
		 ,[__o4] "m" (Xo4)\
		 ,[__o5] "m" (Xo5)\
		 ,[__o6] "m" (Xo6)\
		 ,[__o7] "m" (Xo7)\
		 ,[__isrt2] "m" (Xisrt2)\
		 ,[__two] "m" (Xtwo)\
		: "cc","memory","rax","rbx","rcx","rdx","rsi","r9","r10","r11","r12","r13","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15"		/* Clobbered registers */\
	);\
	}

	// Need a 2nd version of above which takes the i-strides as intvars rather than literal bytes:
	// Dataflow is identical to SSE2_RADIX8_DIF_0TWIDDLE above; the only difference is that the
	// __i1-__i7 strides arrive as 32-bit integer variables ('m'-constraints), which are sign-extended
	// into GPRs via movslq and then added to the __r0 base address to form the 8 input pointers.
	#define SSE2_RADIX8_DIF_0TWIDDLE_B(Xr0, Xi1,Xi2,Xi3,Xi4,Xi5,Xi6,Xi7, Xo0,Xo1,Xo2,Xo3,Xo4,Xo5,Xo6,Xo7, Xisrt2,Xtwo)\
	{\
	__asm__ volatile (\
	/* 1st of 2 radix-4 subtransforms, data in ymm0-7: */	/* 2nd of 2 radix-4 subtransforms, data in ymm8-15: */\
		"movq	%[__isrt2],%%rsi				\n\t		movq	%[__two],%%r9	\n\t"/* r9 holds 2.0 throughout */\
		"movq	%[__r0],%%rax	/* i0 = r00 */	\n\t		movslq	%[__i1],%%r10		/* i1 */	\n\t"\
		"movslq	%[__i2],%%rbx	/* i2 */		\n\t		movslq	%[__i3],%%r11		/* i3 */	\n\t"\
		"movslq	%[__i4],%%rcx	/* i4 */		\n\t		movslq	%[__i5],%%r12		/* i5 */	\n\t"\
		"movslq	%[__i6],%%rdx	/* i6 */		\n\t		movslq	%[__i7],%%r13		/* i7 */	\n\t"\
		"addq	%%rax,%%rbx						\n\t		addq	%%rax,%%r10						\n\t"\
		"addq	%%rax,%%rcx						\n\t		addq	%%rax,%%r11						\n\t"\
		"addq	%%rax,%%rdx						\n\t		addq	%%rax,%%r12						\n\t"\
		"													addq	%%rax,%%r13						\n\t"\
	/* p0,4 combo: x+-y into ymm0/1, 2/3, resp: */		/* p1,5 combo: x+y into ymm8 /1, x-y in ymm10/3: */\
	/* p2,6 combo: x+-y into ymm4/5, 6/7, resp: */		/* p3,7 combo: x+y into ymm14/7, x-y in ymm12/5: */\
		"vmovaps	     (%%rcx),%%ymm0			\n\t		vmovaps	     (%%r12),%%ymm8 			\n\t"\
		"vmovaps	0x020(%%rcx),%%ymm1			\n\t		vmovaps	0x020(%%r12),%%ymm9 			\n\t"\
		"vmovaps	     (%%rax),%%ymm2			\n\t		vmovaps	     (%%r10),%%ymm10			\n\t"\
		"vmovaps	0x020(%%rax),%%ymm3			\n\t		vmovaps	0x020(%%r10),%%ymm11			\n\t"\
		"vmovaps	     (%%rdx),%%ymm4			\n\t		vmovaps	     (%%r11),%%ymm12			\n\t"\
		"vmovaps	0x020(%%rdx),%%ymm5			\n\t		vmovaps	0x020(%%r11),%%ymm13			\n\t"\
		"vmovaps	     (%%rbx),%%ymm6			\n\t		vmovaps	     (%%r13),%%ymm14			\n\t"\
		"vmovaps	0x020(%%rbx),%%ymm7			\n\t		vmovaps	0x020(%%r13),%%ymm15			\n\t"\
		"vsubpd	%%ymm0,%%ymm2,%%ymm2			\n\t		vsubpd	%%ymm8 ,%%ymm10,%%ymm10			\n\t"\
		"vsubpd	%%ymm1,%%ymm3,%%ymm3			\n\t		vsubpd	%%ymm9 ,%%ymm11,%%ymm11			\n\t"\
		"vsubpd	%%ymm4,%%ymm6,%%ymm6			\n\t		vsubpd	%%ymm14,%%ymm12,%%ymm12			\n\t"\
		"vsubpd	%%ymm5,%%ymm7,%%ymm7			\n\t		vsubpd	%%ymm15,%%ymm13,%%ymm13			\n\t"\
	"vmovaps	%%ymm15,(%%rax) 	\n\t"/* spill ymm15 to make room for 2.0 */"	vmovaps	 (%%r9),%%ymm15	\n\t"/* two */\
	"vfmadd132pd	%%ymm15,%%ymm2,%%ymm0		\n\t	vfmadd132pd	%%ymm15,%%ymm10,%%ymm8 	\n\t"\
	"vfmadd132pd	%%ymm15,%%ymm3,%%ymm1		\n\t	vfmadd132pd	%%ymm15,%%ymm11,%%ymm9 	\n\t"\
	"vfmadd132pd	%%ymm15,%%ymm6,%%ymm4		\n\t	vfmadd132pd	%%ymm15,%%ymm12,%%ymm14	\n\t"\
	"vfmadd132pd	%%ymm15,%%ymm7,%%ymm5		\n\t	vfmadd132pd	(%%rax),%%ymm13,%%ymm15	\n\t"\
		/* Finish radix-4 butterfly and store results into temporary-array slots: */\
		"vsubpd		%%ymm4,%%ymm0,%%ymm0		\n\t		vsubpd		%%ymm14,%%ymm8 ,%%ymm8 		\n\t"\
		"vsubpd		%%ymm5,%%ymm1,%%ymm1		\n\t		vsubpd		%%ymm15,%%ymm9 ,%%ymm9 		\n\t"\
		"vsubpd		%%ymm7,%%ymm2,%%ymm2		\n\t		vsubpd		%%ymm13,%%ymm10,%%ymm10		\n\t"\
		"vsubpd		%%ymm6,%%ymm3,%%ymm3		\n\t		vsubpd		%%ymm12,%%ymm11,%%ymm11		\n\t"\
	"vmovaps	%%ymm12,(%%rax) 	\n\t"/* spill ymm12 to make room for 2.0 */"	vmovaps	 (%%r9),%%ymm12	\n\t"/* two */\
	"vfmadd132pd	%%ymm12,%%ymm0,%%ymm4		\n\t	vfmadd132pd		%%ymm12,%%ymm8 ,%%ymm14		\n\t"\
	"vfmadd132pd	%%ymm12,%%ymm1,%%ymm5		\n\t	vfmadd132pd		%%ymm12,%%ymm9 ,%%ymm15		\n\t"\
	"vfmadd132pd	%%ymm12,%%ymm2,%%ymm7		\n\t	vfmadd132pd		%%ymm12,%%ymm10,%%ymm13		\n\t"\
	"vfmadd132pd	%%ymm12,%%ymm3,%%ymm6		\n\t	vfmadd132pd		(%%rax),%%ymm11,%%ymm12		\n\t"\
		"													vsubpd		%%ymm12,%%ymm10,%%ymm10		\n\t"\
		"													vsubpd		%%ymm11,%%ymm13,%%ymm13		\n\t"\
		"												vfmadd132pd		(%%r9 ),%%ymm10,%%ymm12		\n\t"/* .two */\
		"												vfmadd132pd		(%%r9 ),%%ymm13,%%ymm11		\n\t"\
		/* SSE2_RADIX8_DIF_COMBINE_RAD4_SUBS(r00,r10,r20,r30,r08,r18,r28,r38): */\
		"\n\t"\
		"movq	%[__o0],%%rax					\n\t		movq	%[__o4],%%r10				\n\t"\
		"movq	%[__o1],%%rbx					\n\t		movq	%[__o5],%%r11				\n\t"\
		"movq	%[__o2],%%rcx					\n\t		movq	%[__o6],%%r12				\n\t"\
		"movq	%[__o3],%%rdx					\n\t		movq	%[__o7],%%r13				\n\t"\
		/* Combine r00,r08,r20,r28: */						/* Combine r10,r18,r30,r38: */\
		"vsubpd		%%ymm14,%%ymm4 ,%%ymm4 		\n\t	vfnmadd231pd	(%%rsi),%%ymm10,%%ymm2 		\n\t"/* .isrt2 */\
		"vsubpd		%%ymm9 ,%%ymm0 ,%%ymm0 		\n\t	vfnmadd231pd	(%%rsi),%%ymm13,%%ymm3 		\n\t"\
		"vsubpd		%%ymm15,%%ymm5 ,%%ymm5 		\n\t	vfnmadd231pd	(%%rsi),%%ymm12,%%ymm6 		\n\t"\
		"vsubpd		%%ymm8 ,%%ymm1 ,%%ymm1 		\n\t	vfnmadd231pd	(%%rsi),%%ymm11,%%ymm7 		\n\t"\
	"vmovaps	%%ymm8 ,(%%rax) 	\n\t"/* spill ymm8  to make room for 2.0 */"	vmovaps	 (%%r9),%%ymm8 	\n\t"/* two */\
	"vmovaps	%%ymm11,(%%r10) 	\n\t"/* spill ymm11 to make room for sqrt2 */"	vmovaps	0x40(%%r9),%%ymm11 \n\t"/* sqrt2 */\
		"vmovaps	%%ymm4 ,    (%%rbx)			\n\t		vmovaps	%%ymm2 ,    (%%r11)			\n\t"\
		"vmovaps	%%ymm0 ,    (%%rcx)			\n\t		vmovaps	%%ymm3 ,0x20(%%r13)			\n\t"\
		"vmovaps	%%ymm5 ,0x20(%%rbx)			\n\t		vmovaps	%%ymm6 ,0x20(%%r11)			\n\t"\
		"vmovaps	%%ymm1 ,0x20(%%rdx)			\n\t		vmovaps	%%ymm7 ,    (%%r12)			\n\t"\
	"vfmadd132pd	%%ymm8 ,%%ymm4 ,%%ymm14		\n\t	vfmadd132pd		%%ymm11,%%ymm2 ,%%ymm10		\n\t"\
	"vfmadd132pd	%%ymm8 ,%%ymm0 ,%%ymm9 		\n\t	vfmadd132pd		%%ymm11,%%ymm3 ,%%ymm13		\n\t"\
	"vfmadd132pd	%%ymm8 ,%%ymm5 ,%%ymm15		\n\t	vfmadd132pd		%%ymm11,%%ymm6 ,%%ymm12		\n\t"\
	"vfmadd132pd	(%%rax),%%ymm1 ,%%ymm8 		\n\t	vfmadd132pd		(%%r10),%%ymm7 ,%%ymm11		\n\t"\
		"vmovaps	%%ymm14,    (%%rax)			\n\t		vmovaps	%%ymm10,    (%%r10)			\n\t"\
		"vmovaps	%%ymm9 ,    (%%rdx)			\n\t		vmovaps	%%ymm13,0x20(%%r12)			\n\t"\
		"vmovaps	%%ymm15,0x20(%%rax)			\n\t		vmovaps	%%ymm12,0x20(%%r10)			\n\t"\
		"vmovaps	%%ymm8 ,0x20(%%rcx)			\n\t		vmovaps	%%ymm11,    (%%r13)			\n\t"\
		:					/* outputs: none */\
		: [__r0] "m" (Xr0)	/* All inputs from memory addresses here */\
		 ,[__i1] "m" (Xi1)\
		 ,[__i2] "m" (Xi2)\
		 ,[__i3] "m" (Xi3)\
		 ,[__i4] "m" (Xi4)\
		 ,[__i5] "m" (Xi5)\
		 ,[__i6] "m" (Xi6)\
		 ,[__i7] "m" (Xi7)\
		 ,[__o0] "m" (Xo0)\
		 ,[__o1] "m" (Xo1)\
		 ,[__o2] "m" (Xo2)\
		 ,[__o3] "m" (Xo3)\
		 ,[__o4] "m" (Xo4)\
		 ,[__o5] "m" (Xo5)\
		 ,[__o6] "m" (Xo6)\
		 ,[__o7] "m" (Xo7)\
		 ,[__isrt2] "m" (Xisrt2)\
		 ,[__two] "m" (Xtwo)\
		: "cc","memory","rax","rbx","rcx","rdx","rsi","r9","r10","r11","r12","r13","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15"		/* Clobbered registers */\
	);\
	}

	// AVX2 analog of dft_macro.h::RADIX_08_DIF_TWIDDLE_OOP - Result of adding separate I/O addressing to
	// radix8_dif_dit_pass_gcc64.h::SSE2_RADIX8_DIF_TWIDDLE.
	//*** Dec 2020: For guide to updated reduced-#arg IO address computation, cf. the SSE2 version of this macro. ***
	//
	// [rsi] (and if needed rdi) points to sine components of each sincos pair, which is not really a pair here in terms of relative addressing.
	//
	// Args: in0 = input base address; i1 = LITERAL byte stride between successive complex inputs ('e'-constraint,
	// materialized in r8 via the xorq/leaq workaround on the first line); out0 = output base pointer;
	// off = pointer to 8 uint32 offsets, each movslq'd and scaled *8 (doubles) to form out0 + off[j];
	// twid_ptrs = pointer to 14 double* twiddle-component pointers in bit-reversed order (c,s)[4,2,6,1,5,3,7];
	// two = pointer to 2.0. ISRT2 is read from [c1] - 0x20, i.e. the slot just below the c1 twiddle
	// (see the "decr 1 gives isrt2" note in the code below).
	//
	#define SSE2_RADIX8_DIF_TWIDDLE_OOP(Xin0,Xi1, Xout0,Xoff, Xtwid_ptrs, Xtwo)\
	{\
	__asm__ volatile (\
		"xorq	%%r8,%%r8	\n\t	leaq	%c[i1](%%r8),%%r8	\n\t"/* movq|movslq of literal %c[i1] both segfaulted, workaround via LEA */\
		"movq	%[in0],%%rax		\n\t	leaq	(%%rax,%%r8,4),%%r10	\n\t"/* [lcol,rcol] base-addresses = in0 + [0,4*istride] */\
		"movq	%[twid_ptrs],%%rsi	\n\t"\
		"leaq	(%%rax,%%r8),%%rbx	\n\t	leaq	(%%r10,%%r8  ),%%r11	\n\t"\
		/* The twid_ptrs[] array holds ptrs to 14 complex twiddles in BR order: (c,s)[4,2,6,1,5,3,7]: */\
		"movq	0x30(%%rsi),%%r12	\n\t	movq	0x40(%%rsi),%%r14	\n\t	movq	    (%%rsi),%%rcx	\n\t"/* c1,5,4 */\
		"movq	0x38(%%rsi),%%r13	\n\t	movq	0x48(%%rsi),%%r15	\n\t	movq	0x08(%%rsi),%%rsi	\n\t"/* s1,5,4 ... do c4,s4 last because s4-result overwrites rsi */\
		"vmovaps	    (%%rbx)	,%%ymm2			\n\t		vmovaps	    (%%r10)	,%%ymm8 			\n\t"\
		"vmovaps	0x20(%%rbx)	,%%ymm3			\n\t		vmovaps	0x20(%%r10)	,%%ymm10			\n\t"\
		"vmovaps		%%ymm2	,%%ymm4			\n\t		vmovaps		%%ymm8 	,%%ymm9 			\n\t"\
		"vmovaps		%%ymm3	,%%ymm5			\n\t		vmovaps		%%ymm10	,%%ymm11			\n\t"\
		"vmulpd		(%%rcx)	,%%ymm2,%%ymm2		\n\t		vmulpd		(%%r12)	,%%ymm8 ,%%ymm8 	\n\t"\
		"vmulpd		(%%rcx)	,%%ymm3,%%ymm3		\n\t		vmulpd		(%%r13)	,%%ymm9 ,%%ymm9 	\n\t"\
	"vfnmadd231pd	(%%rsi)	,%%ymm5,%%ymm2		\n\t	vfnmadd231pd	(%%r13)	,%%ymm10,%%ymm8 	\n\t"\
	" vfmadd231pd	(%%rsi)	,%%ymm4,%%ymm3		\n\t	 vfmadd231pd	(%%r12)	,%%ymm11,%%ymm9 	\n\t"\
		"vmovaps	    (%%rax)	,%%ymm0			\n\t		vmovaps	    (%%r11)	,%%ymm10			\n\t"\
		"vmovaps	0x20(%%rax)	,%%ymm1			\n\t		vmovaps	0x20(%%r11)	,%%ymm11			\n\t"\
		"vmovaps		%%ymm0	,%%ymm6			\n\t		vmovaps	    (%%r11)	,%%ymm12			\n\t"\
		"vmovaps		%%ymm1	,%%ymm7			\n\t		vmovaps	0x20(%%r11)	,%%ymm13			\n\t"\
		"vaddpd	%%ymm2		,%%ymm0,%%ymm0		\n\t		vmulpd		(%%r14)	,%%ymm10,%%ymm10	\n\t"\
		"vaddpd	%%ymm3		,%%ymm1,%%ymm1		\n\t		vmulpd		(%%r15)	,%%ymm12,%%ymm12	\n\t"\
		"vsubpd	%%ymm2		,%%ymm6,%%ymm6		\n\t	vfnmadd231pd	(%%r15)	,%%ymm11,%%ymm10	\n\t	vmovaps	%%ymm10,%%ymm11	\n\t"\
		"vsubpd	%%ymm3		,%%ymm7,%%ymm7		\n\t	 vfmadd231pd	(%%r14)	,%%ymm13,%%ymm12	\n\t	vmovaps	%%ymm12,%%ymm13	\n\t"\
		"vmovaps	%%ymm0		,    (%%rax)	\n\t		vaddpd	%%ymm8 		,%%ymm10,%%ymm10	\n\t"\
		"vmovaps	%%ymm1		,0x20(%%rax)	\n\t		vsubpd	%%ymm11		,%%ymm8 ,%%ymm8 	\n\t"\
		"vmovaps	%%ymm6		,    (%%rbx)	\n\t		vaddpd	%%ymm9 		,%%ymm12,%%ymm12	\n\t"\
		"vmovaps	%%ymm7		,0x20(%%rbx)	\n\t		vsubpd	%%ymm13		,%%ymm9 ,%%ymm9 	\n\t"\
		"leaq	(%%rax,%%r8,2),%%rax			\n\t		vmovaps	%%ymm10		,    (%%r10)	\n\t"\
		"leaq	(%%rbx,%%r8,2),%%rbx			\n\t		vmovaps	%%ymm12		,0x20(%%r10)	\n\t"\
		"movq	%[twid_ptrs],%%r15				\n\t"\
		"movq		0x10(%%r15),%%rcx			\n\t		vmovaps	%%ymm8 		,    (%%r11)	\n\t"/* c2 */\
		"movq		0x20(%%r15),%%rdx			\n\t		vmovaps	%%ymm9 		,0x20(%%r11)	\n\t"/* c6 */\
		"movq		0x18(%%r15),%%rsi	\n\t	leaq	(%%r10,%%r8,2),%%r10	\n\t	movq 0x50(%%r15),%%r12 \n\t movq 0x58(%%r15),%%r13	\n\t"/* s2, c3,s3 */\
		"movq		0x28(%%r15),%%rdi	\n\t	leaq	(%%r11,%%r8,2),%%r11	\n\t	movq 0x60(%%r15),%%r14 \n\t movq 0x68(%%r15),%%r15	\n\t"/* s6, c7,s7 */\
		"vmovaps	    (%%rax)	,%%ymm0			\n\t		vmovaps	    (%%r10)	,%%ymm8 			\n\t"\
		"vmovaps	0x20(%%rax)	,%%ymm2			\n\t		vmovaps	0x20(%%r10)	,%%ymm10			\n\t"\
		"vmovaps		%%ymm0	,%%ymm1			\n\t		vmovaps		%%ymm8 	,%%ymm9 			\n\t"\
		"vmovaps		%%ymm2	,%%ymm3			\n\t		vmovaps		%%ymm10	,%%ymm11			\n\t"\
		"vmulpd		(%%rcx)	,%%ymm0,%%ymm0		\n\t		vmulpd		(%%r12)	,%%ymm8 ,%%ymm8 	\n\t"\
		"vmulpd		(%%rsi)	,%%ymm1,%%ymm1		\n\t		vmulpd		(%%r13)	,%%ymm9 ,%%ymm9 	\n\t"\
	"vfnmadd231pd	(%%rsi)	,%%ymm2,%%ymm0		\n\t	vfnmadd231pd	(%%r13)	,%%ymm10,%%ymm8 	\n\t"\
	" vfmadd231pd	(%%rcx)	,%%ymm3,%%ymm1		\n\t	 vfmadd231pd	(%%r12)	,%%ymm11,%%ymm9 	\n\t"\
		"vmovaps	    (%%rbx)	,%%ymm2			\n\t		vmovaps	    (%%r11)	,%%ymm10			\n\t"\
		"vmovaps	0x20(%%rbx)	,%%ymm3			\n\t		vmovaps	0x20(%%r11)	,%%ymm11			\n\t"\
		"vmovaps		%%ymm2	,%%ymm4			\n\t		vmovaps		%%ymm10	,%%ymm12			\n\t"\
		"vmovaps		%%ymm3	,%%ymm5			\n\t		vmovaps		%%ymm11	,%%ymm13			\n\t"\
		"vmulpd		(%%rdx)	,%%ymm2,%%ymm2		\n\t		vmulpd		(%%r14)	,%%ymm10,%%ymm10	\n\t"\
		"vmulpd		(%%rdi)	,%%ymm4,%%ymm4		\n\t		vmulpd		(%%r15)	,%%ymm12,%%ymm12	\n\t"\
	"vfnmadd231pd	(%%rdi)	,%%ymm3,%%ymm2		\n\t	vfnmadd231pd	(%%r15)	,%%ymm11,%%ymm10	\n\t"\
	" vfmadd231pd	(%%rdx)	,%%ymm5,%%ymm4		\n\t	 vfmadd231pd	(%%r14)	,%%ymm13,%%ymm12	\n\t"\
		"vmovaps	%%ymm2		,%%ymm3			\n\t		vmovaps	%%ymm10		,%%ymm11			\n\t"\
		"vmovaps	%%ymm4		,%%ymm5			\n\t		vmovaps	%%ymm12		,%%ymm13			\n\t"\
		"vaddpd	%%ymm0		,%%ymm2,%%ymm2		\n\t		vaddpd	%%ymm8 		,%%ymm10,%%ymm10	\n\t"\
		"vsubpd	%%ymm3		,%%ymm0,%%ymm0		\n\t		vsubpd	%%ymm11		,%%ymm8 ,%%ymm8 	\n\t"\
		"vaddpd	%%ymm1		,%%ymm4,%%ymm4		\n\t		vaddpd	%%ymm9 		,%%ymm12,%%ymm12	\n\t"\
		"vsubpd	%%ymm5		,%%ymm1,%%ymm1		\n\t		vsubpd	%%ymm13		,%%ymm9 ,%%ymm9 	\n\t"\
		"vmovaps	%%ymm2		,    (%%rax)	\n\t		vmovaps	%%ymm10		,    (%%r10)	\n\t"\
		"vmovaps	%%ymm4		,0x20(%%rax)	\n\t		vmovaps	%%ymm12		,0x20(%%r10)	\n\t"\
		"vmovaps	%%ymm0		,    (%%rbx)	\n\t		vmovaps	%%ymm8 		,    (%%r11)	\n\t"\
		"vmovaps	%%ymm1		,0x20(%%rbx)	\n\t		vmovaps	%%ymm9 		,0x20(%%r11)	\n\t"\
	/* combine to get 2 length-4 output subtransforms.
	In this step 2 of the 8-dft, we need address-pairs
		lcol:		rcol:
		i0,2,1,3	i4,6,5,7
		o0,2,1,3	o4,6,5,7
	At this point r[a|b]x have i2,3, r1[0|1] have i6,7, but cleaner to reload add0 and go from there.
	Since we will be loading o-addresses into regs starting with r[a|b]x and r1[0|1], use r[c|d]x and r1[2|3]
	for the I-address pairs here: */\
	"movq	%[in0],%%rcx			\n\t		leaq	(%%rcx,%%r8  ),%%r12	\n\t"/* [lcol,rcol] base-addresses = in0 + [0,1*istride] */\
	"leaq	(%%rcx,%%r8,2),%%rdx	\n\t		leaq	(%%r12,%%r8,2),%%r13	\n\t"/* in0 + [2,3*istride] */\
		"vmovaps	    (%%rdx)	,%%ymm0			\n\t		vmovaps	    (%%r13)	,%%ymm9 			\n\t"\
		"vmovaps	0x20(%%rdx)	,%%ymm1			\n\t		vmovaps	0x20(%%r13)	,%%ymm12			\n\t"\
		"vmovaps	    (%%rcx)	,%%ymm4			\n\t		vmovaps	    (%%r12)	,%%ymm8 			\n\t"\
		"vmovaps	0x20(%%rcx)	,%%ymm5			\n\t		vmovaps	0x20(%%r12)	,%%ymm13			\n\t"\
	"shlq	$2,%%r8			\n\t"/* From here on only need offset i4 = 4*i1 */\
	"addq	%%r8,%%rdx				\n\t		addq	%%r8,%%r13				\n\t"/* in0 + [6,7*istride] */\
	"addq	%%r8,%%rcx				\n\t		addq	%%r8,%%r12				\n\t"/* in0 + [4,5*istride] */\
		"vmovaps	    (%%rdx)	,%%ymm2			\n\t		vmovaps	    (%%r13)	,%%ymm11			\n\t"\
		"vmovaps	0x20(%%rdx)	,%%ymm3			\n\t		vmovaps	0x20(%%r13)	,%%ymm14			\n\t"\
		"vmovaps	    (%%rcx)	,%%ymm6			\n\t		vmovaps	    (%%r12)	,%%ymm10			\n\t"\
		"vmovaps	0x20(%%rcx)	,%%ymm7			\n\t		vmovaps	0x20(%%r12)	,%%ymm15			\n\t"\
	"movq	%[out0]	,%%rsi			\n\t	movq	%[off]	,%%rdi			\n\t"/* Load output base-address into rsi and offset-array pointer into rdi */\
	"movslq		    (%%rdi),%%rax	\n\t	movslq		0x10(%%rdi),%%r10	\n\t"/*        off[0,4] */\
	"leaq	(%%rsi,%%rax,8),%%rax	\n\t	leaq	(%%rsi,%%r10,8),%%r10	\n\t"/* out0 + off[0,4] */\
	"movslq		0x08(%%rdi),%%rbx	\n\t	movslq		0x18(%%rdi),%%r11	\n\t"\
	"leaq	(%%rsi,%%rbx,8),%%rbx	\n\t	leaq	(%%rsi,%%r11,8),%%r11	\n\t"/* out0 + off[2,6] */\
		"vsubpd		%%ymm0	,%%ymm4,%%ymm4		\n\t		vsubpd		%%ymm9 ,%%ymm13,%%ymm13		\n\t"\
		"vsubpd		%%ymm1	,%%ymm5,%%ymm5		\n\t		vsubpd		%%ymm12,%%ymm8 ,%%ymm8 		\n\t"\
		"vsubpd		%%ymm2	,%%ymm6,%%ymm6		\n\t		vsubpd		%%ymm11,%%ymm15,%%ymm15		\n\t"\
		"vsubpd		%%ymm3	,%%ymm7,%%ymm7		\n\t		vsubpd		%%ymm14,%%ymm10,%%ymm10		\n\t"\
	"movq	%[two]	,%%r15			\n\t"\
	"vmovaps	%%ymm14,(%%rbx) 	\n\t"/* spill ymm14 to make room for 2.0 */"	vmovaps	 (%%r15),%%ymm14	\n\t"/* two */\
	"vfmadd132pd	%%ymm14,%%ymm4,%%ymm0		\n\t	vfmadd132pd		%%ymm14,%%ymm13,%%ymm9 		\n\t"\
	"vfmadd132pd	%%ymm14,%%ymm5,%%ymm1		\n\t	vfmadd132pd		%%ymm14,%%ymm8 ,%%ymm12		\n\t"\
	"vfmadd132pd	%%ymm14,%%ymm6,%%ymm2		\n\t	vfmadd132pd		%%ymm14,%%ymm15,%%ymm11		\n\t"\
	"vfmadd132pd	%%ymm14,%%ymm7,%%ymm3		\n\t	vfmadd132pd		(%%rbx),%%ymm10,%%ymm14		\n\t"\
		"vsubpd	%%ymm2		,%%ymm0,%%ymm0		\n\t		vsubpd	%%ymm11		,%%ymm10,%%ymm10	\n\t"\
		"vsubpd	%%ymm3		,%%ymm1,%%ymm1		\n\t		vsubpd	%%ymm14		,%%ymm15,%%ymm15	\n\t"\
		"vsubpd	%%ymm7		,%%ymm4,%%ymm4		\n\t		vfmadd132pd	(%%r15) ,%%ymm10,%%ymm11	\n\t"\
		"vsubpd	%%ymm6		,%%ymm5,%%ymm5		\n\t		vfmadd132pd	(%%r15) ,%%ymm15,%%ymm14	\n\t"\
	"movslq		0x04(%%rdi),%%rcx	\n\t	movslq		0x14(%%rdi),%%r12	\n\t"\
	"leaq	(%%rsi,%%rcx,8),%%rcx	\n\t	leaq	(%%rsi,%%r12,8),%%r12	\n\t"/* out0 + off[1,5] */\
	"movslq		0x0c(%%rdi),%%rdx	\n\t	movslq		0x1c(%%rdi),%%r13	\n\t"\
	"leaq	(%%rsi,%%rdx,8),%%rdx	\n\t	leaq	(%%rsi,%%r13,8),%%r13	\n\t"/* out0 + off[3,7] */\
	/* Use the cosine term of the [c1,s1] pair, which is the *middle* [4th of 7] of our 7 input pairs, in terms \
	of the input-arg bit-reversal reordering defined in the __X[c,s] --> [c,s] mapping below and happens to \
	always in fact *be* a true cosine term, which is a requirement for our "decr 1 gives isrt2" data-copy scheme: */\
		"movq	%[twid_ptrs],%%r14		\n\t	movq	0x30(%%r14),%%r14	\n\t"\
		"										subq	$0x20,%%r14	\n\t"/* isrt2 in [c1]-1 */\
	"vfmadd132pd	(%%r15),%%ymm0,%%ymm2		\n\t	vfnmadd231pd	(%%r14),%%ymm10,%%ymm8 		\n\t"/* .isrt2 */\
	"vfmadd132pd	(%%r15),%%ymm1,%%ymm3		\n\t	vfnmadd231pd	(%%r14),%%ymm15,%%ymm13		\n\t"\
	"vfmadd132pd	(%%r15),%%ymm5,%%ymm6		\n\t	vfnmadd231pd	(%%r14),%%ymm11,%%ymm9 		\n\t"\
	"vfmadd132pd	(%%r15),%%ymm4,%%ymm7		\n\t	vfnmadd231pd	(%%r14),%%ymm14,%%ymm12		\n\t"\
		"vmovaps	%%ymm2,    (%%rax)	\n\t"/* [o0].re */"	vmovaps	%%ymm8 ,    (%%r12)	\n\t"/* [o5].re */	"vmovaps	(%%r14),%%ymm2		\n\t"/* ymm2 = ISRT2 */\
		"vmovaps	%%ymm3,0x20(%%rax)	\n\t"/* [o0].im */"	vmovaps	%%ymm13,0x20(%%r11)	\n\t"/* [o6].im */	"vaddpd	%%ymm2,%%ymm2,%%ymm2	\n\t"/* ymm2 = SQRT2; */\
		"vmovaps	%%ymm6,0x20(%%rbx)	\n\t"/* [o2].im */"	vmovaps	%%ymm9 ,0x20(%%r12)	\n\t"/* [o5].im */\
		"vmovaps	%%ymm7,    (%%rdx)	\n\t"/* [o3].re */"	vmovaps	%%ymm12,    (%%r11)	\n\t"/* [o6].re */\
		"												 vfmadd132pd	%%ymm2,%%ymm8 ,%%ymm10		\n\t"/* .sqrt2 */\
		"												 vfmadd132pd	%%ymm2,%%ymm13,%%ymm15		\n\t"\
		"												 vfmadd132pd	%%ymm2,%%ymm9 ,%%ymm11		\n\t"\
		"												 vfmadd132pd	%%ymm2,%%ymm12,%%ymm14		\n\t"\
		"vmovaps	%%ymm0,    (%%rcx)	\n\t"/* [o1].re */"	vmovaps	%%ymm10,    (%%r10)	\n\t"/* [o4].re */\
		"vmovaps	%%ymm1,0x20(%%rcx)	\n\t"/* [o1].im */"	vmovaps	%%ymm15,0x20(%%r13)	\n\t"/* [o7].im */\
		"vmovaps	%%ymm4,    (%%rbx)	\n\t"/* [o2].re */"	vmovaps	%%ymm11,0x20(%%r10)	\n\t"/* [o4].im */\
		"vmovaps	%%ymm5,0x20(%%rdx)	\n\t"/* [o3].im */"	vmovaps	%%ymm14,    (%%r13)	\n\t"/* [o7].re */\
		:					/* outputs: none */\
		: [in0] "m" (Xin0)	/* All 'm'-inputs from memory addresses here... */\
		 ,[i1] "e" (Xi1)	/* ...except for 'e'-inputs which are literal byte offsets */\
		 ,[out0] "m" (Xout0) /* output-address-octet base pointer */\
		 ,[off] "m" (Xoff)	/* and pointer to uint32 array of 8 double* index offsets */\
		 ,[twid_ptrs] "m" (Xtwid_ptrs)\
		 ,[two] "m" (Xtwo)/* Only used in FMA implementations of this macro */\
		: "cc","memory","rax","rbx","rcx","rdx","rsi","rdi","r8","r10","r11","r12","r13","r14","r15","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15"	/* Clobbered registers */\
	);\
	}

	/* Twiddleless version of SSE2_RADIX8_DIT_TWIDDLE. Inputs enter in memory locations __i0,__i1,__i2,__i3,__i4,__i5,__i6,__i7.
	Outputs go into 16 contiguous 32-byte memory locations starting at __out and assumed disjoint with inputs.
	This macro built on the same code template as SSE2_RADIX8_DIF_TWIDDLE0, but with the I/O-location indices mutually bit reversed:
	01234567 <--> 04261537, which can be effected via the pairwise swaps 1 <--> 4 and 3 <--> 6.
	__isrt2 points to 1/sqrt(2); __two points to 2.0, with sqrt(2) loaded from byte offset 0x40 past __two
	(cf. the 0x40(%%r9) load below). GPR roles: lcol rax-rdx = i0-i3, rcol r10-r13 = i4-i7,
	rsi = contiguous output base __out, rdi -> isrt2, r9 -> two. As in the DIF macros, ymm values are
	spilled to (%%rsi)/0x20(%%rsi) to free registers for the 2.0/sqrt2 constants.
	*/
	#define	SSE2_RADIX8_DIT_0TWIDDLE(Xi0,Xi1,Xi2,Xi3,Xi4,Xi5,Xi6,Xi7, Xout, Xisrt2,Xtwo)\
	{\
	__asm__ volatile (\
		"movq	%[__isrt2],%%rdi				\n\t		movq	%[__two],%%r9	\n\t"/* r9 holds 2.0 throughout */\
		"movq	%[__out],%%rsi	\n\t"\
	/* 1st of 2 radix-4 subtransforms, data in ymm0-7: *//* 2nd of 2 radix-4 subtransforms, data in ymm8-15: */\
		"movq	%[__i0],%%rax					\n\t		movq	%[__i4],%%r10					\n\t"\
		"movq	%[__i1],%%rbx					\n\t		movq	%[__i5],%%r11					\n\t"\
		"movq	%[__i2],%%rcx					\n\t		movq	%[__i6],%%r12					\n\t"\
		"movq	%[__i3],%%rdx					\n\t		movq	%[__i7],%%r13					\n\t"\
		"vmovaps	    (%%rax),%%ymm2				\n\t		vmovaps	    (%%r10),%%ymm10	\n\t"\
		"vmovaps	0x20(%%rax),%%ymm3				\n\t		vmovaps	0x20(%%r10),%%ymm11	\n\t"\
		"vmovaps	    (%%rcx),%%ymm6				\n\t		vmovaps	    (%%r11),%%ymm8 	\n\t"\
		"vmovaps	0x20(%%rcx),%%ymm7				\n\t		vmovaps	0x20(%%r11),%%ymm9 	\n\t"\
		"vmovaps	    (%%rbx),%%ymm0				\n\t		vmovaps	    (%%r12),%%ymm14	\n\t"\
		"vmovaps	    (%%rdx),%%ymm4				\n\t		vmovaps	0x20(%%r12),%%ymm15	\n\t"\
		"vmovaps	0x20(%%rbx),%%ymm1				\n\t		vmovaps	    (%%r13),%%ymm12	\n\t"\
		"vmovaps	0x20(%%rdx),%%ymm5				\n\t"	/*	vmovaps	0x20(%%r13),%%ymm13	Instead use ymm13 for 2.0: */"	vmovaps	(%%r9),%%ymm13 	\n\t"\
		"vsubpd		%%ymm0,%%ymm2,%%ymm2			\n\t		vsubpd		%%ymm8 ,%%ymm10,%%ymm10		\n\t"\
		"vsubpd		%%ymm1,%%ymm3,%%ymm3			\n\t		vsubpd		%%ymm9 ,%%ymm11,%%ymm11		\n\t"\
		"vsubpd		%%ymm4,%%ymm6,%%ymm6			\n\t		vsubpd		%%ymm12,%%ymm14,%%ymm14		\n\t"\
		"vsubpd		%%ymm5,%%ymm7,%%ymm7			\n\t		vsubpd	0x20(%%r13),%%ymm15,%%ymm15		\n\t"\
	"vfmadd132pd	%%ymm13,%%ymm2,%%ymm0			\n\t	vfmadd132pd		%%ymm13,%%ymm10,%%ymm8 		\n\t"\
	"vfmadd132pd	%%ymm13,%%ymm3,%%ymm1			\n\t	vfmadd132pd		%%ymm13,%%ymm11,%%ymm9 		\n\t"\
	"vfmadd132pd	%%ymm13,%%ymm6,%%ymm4			\n\t	vfmadd132pd		%%ymm13,%%ymm14,%%ymm12		\n\t"\
	"vfmadd132pd	%%ymm13,%%ymm7,%%ymm5			\n\t	vfmadd132pd	0x20(%%r13),%%ymm15,%%ymm13		\n\t"\
		"vsubpd		%%ymm7,%%ymm2,%%ymm2			\n\t		vsubpd		%%ymm12,%%ymm8 ,%%ymm8 		\n\t"\
		"vsubpd		%%ymm6,%%ymm3,%%ymm3			\n\t		vsubpd		%%ymm13,%%ymm9 ,%%ymm9 		\n\t"\
		"vsubpd		%%ymm4,%%ymm0,%%ymm0			\n\t		vsubpd		%%ymm14,%%ymm11,%%ymm11		\n\t"\
		"vsubpd		%%ymm5,%%ymm1,%%ymm1			\n\t		vsubpd		%%ymm15,%%ymm10,%%ymm10		\n\t"\
	"vmovaps	%%ymm14,(%%rsi) 	\n\t"/* spill ymm14 to make room for 2.0 */"	vmovaps	(%%r9),%%ymm14 	\n\t"/* two */\
	"vfmadd132pd	%%ymm14,%%ymm0,%%ymm4			\n\t	vfmadd132pd		%%ymm14,%%ymm8 ,%%ymm12		\n\t"\
	"vfmadd132pd	%%ymm14,%%ymm1,%%ymm5			\n\t	vfmadd132pd		%%ymm14,%%ymm9 ,%%ymm13		\n\t"\
	"vfmadd132pd	%%ymm14,%%ymm2,%%ymm7			\n\t	vfmadd132pd		%%ymm14,%%ymm10,%%ymm15		\n\t"\
	"vfmadd132pd	%%ymm14,%%ymm3,%%ymm6			\n\t	vfmadd132pd		(%%rsi),%%ymm11,%%ymm14		\n\t"\
		"														vsubpd		%%ymm15,%%ymm11,%%ymm11		\n\t"\
		"														vsubpd		%%ymm10,%%ymm14,%%ymm14		\n\t"\
		"													vfmadd132pd		(%%r9 ),%%ymm11,%%ymm15		\n\t"/* .two */\
		"													vfmadd132pd		(%%r9 ),%%ymm14,%%ymm10		\n\t"\
		/* Outputs 1-7 order-reversed in the SIMD version of this macro: Thus swap output byte-offset pairs */\
		/* 0x[40,60] <-> [1c0,1e0], [80,a0] <-> [180,1a0], [c0,e0] <-> [140,160] : */\
		"vsubpd		%%ymm9 ,%%ymm0 ,%%ymm0 			\n\t	vfnmadd231pd	(%%rdi),%%ymm14,%%ymm2 		\n\t"/* .isrt2 */\
		"vsubpd		%%ymm13,%%ymm5 ,%%ymm5 			\n\t	vfnmadd231pd	(%%rdi),%%ymm11,%%ymm3 		\n\t"\
		"vsubpd		%%ymm12,%%ymm4 ,%%ymm4 			\n\t	vfnmadd231pd	(%%rdi),%%ymm15,%%ymm7 		\n\t"\
		"vsubpd		%%ymm8 ,%%ymm1 ,%%ymm1 			\n\t	vfnmadd231pd	(%%rdi),%%ymm10,%%ymm6 		\n\t"\
	"vmovaps	%%ymm8 ,0x20(%%rsi) 	\n\t"/* spill ymm8  to make room for 2.0 */"	vmovaps	 (%%r9),%%ymm8	\n\t"/* two */\
	"vmovaps	%%ymm10,(%%rsi) 	\n\t"/* spill ymm10 to make room for sqrt2 */"	vmovaps	0x40(%%r9),%%ymm10	\n\t"/* sqrt2 */\
		"vmovaps	%%ymm0 ,0x080(%%rsi)			\n\t		vmovaps		%%ymm2 ,0x040(%%rsi)		\n\t"\
		"vmovaps	%%ymm5 ,0x120(%%rsi)			\n\t		vmovaps		%%ymm3 ,0x0e0(%%rsi)		\n\t"\
		"vmovaps	%%ymm4 ,0x100(%%rsi)			\n\t		vmovaps		%%ymm7 ,0x0c0(%%rsi)		\n\t"\
		"vmovaps	%%ymm1 ,0x1a0(%%rsi)			\n\t		vmovaps		%%ymm6 ,0x160(%%rsi)		\n\t"\
	"vfmadd132pd		%%ymm8 ,%%ymm0 ,%%ymm9 		\n\t	vfmadd132pd		%%ymm10,%%ymm2 ,%%ymm14		\n\t"\
	"vfmadd132pd		%%ymm8 ,%%ymm5 ,%%ymm13		\n\t	vfmadd132pd		%%ymm10,%%ymm3 ,%%ymm11		\n\t"\
	"vfmadd132pd		%%ymm8 ,%%ymm4 ,%%ymm12		\n\t	vfmadd132pd		%%ymm10,%%ymm7 ,%%ymm15		\n\t"\
	"vfmadd132pd	0x20(%%rsi),%%ymm1 ,%%ymm8 		\n\t	vfmadd132pd		(%%rsi),%%ymm6 ,%%ymm10		\n\t"\
		"vmovaps	%%ymm9 ,0x180(%%rsi)			\n\t		vmovaps		%%ymm14,0x140(%%rsi)		\n\t"\
		"vmovaps	%%ymm13,0x020(%%rsi)			\n\t		vmovaps		%%ymm11,0x1e0(%%rsi)		\n\t"\
		"vmovaps	%%ymm12,     (%%rsi)			\n\t		vmovaps		%%ymm15,0x1c0(%%rsi)		\n\t"\
		"vmovaps	%%ymm8 ,0x0a0(%%rsi)			\n\t		vmovaps		%%ymm10,0x060(%%rsi)		\n\t"\
		:					/* outputs: none */\
		: [__i0] "m" (Xi0)	/* All inputs from memory addresses here */\
		 ,[__i1] "m" (Xi1)\
		 ,[__i2] "m" (Xi2)\
		 ,[__i3] "m" (Xi3)\
		 ,[__i4] "m" (Xi4)\
		 ,[__i5] "m" (Xi5)\
		 ,[__i6] "m" (Xi6)\
		 ,[__i7] "m" (Xi7)\
		 ,[__out] "m" (Xout)\
		 ,[__isrt2] "m" (Xisrt2)\
		 ,[__two] "m" (Xtwo)\
		: "cc","memory","rax","rbx","rcx","rdx","rsi","rdi","r9","r10","r11","r12","r13","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15"		/* Clobbered registers */\
	);\
	}

	// Same as SSE2_RADIX8_DIT_0TWIDDLE but with user-specifiable [i.e. not nec. contiguous] output addresses:
	// Radix-8 DIT DFT, no twiddles, out-of-place: reads the 8 complex inputs (Re at offset 0,
	// Im at offset 0x20) from addresses Xi0-Xi7 and writes the 8 complex outputs to Xo0-Xo7.
	// Xisrt2 -> 1/sqrt(2); Xtwo -> 2.0, with sqrt(2) read from byte offset 0x40 past it (cf. the
	// 0x40(%%r9) load below). NOTE(review): that two+0x40 = sqrt2 layout is assumed from the
	// sincos local-store defined elsewhere - confirm against the callers' constant layout.
	// Structure: two parallel 4-input radix-4 DIT subtransforms (ymm0-7 in the left column,
	// ymm8-15 in the right), then the radix-2 combination step with the right column's +-i and
	// (1+-i)*isrt2 internal "twiddles" folded in via FMA; register spills to scratch slots in
	// the I/O buffers make room for the 2.0 / sqrt2 constants needed by the FMA forms.
	#define	SSE2_RADIX8_DIT_0TWIDDLE_OOP(Xi0,Xi1,Xi2,Xi3,Xi4,Xi5,Xi6,Xi7, Xo0,Xo1,Xo2,Xo3,Xo4,Xo5,Xo6,Xo7, Xisrt2,Xtwo)\
	{\
	__asm__ volatile (\
		"movq	%[__isrt2],%%rdi				\n\t		movq	%[__two],%%r9	\n\t"/* r9 holds 2.0 throughout */\
	/* 1st of 2 radix-4 subtransforms, data in ymm0-7: *//* 2nd of 2 radix-4 subtransforms, data in ymm8-15: */\
		"movq	%[__i0],%%rax					\n\t		movq	%[__i4],%%r10					\n\t"\
		"movq	%[__i1],%%rbx					\n\t		movq	%[__i5],%%r11					\n\t"\
		"movq	%[__i2],%%rcx					\n\t		movq	%[__i6],%%r12					\n\t"\
		"movq	%[__i3],%%rdx					\n\t		movq	%[__i7],%%r13					\n\t"\
		"vmovaps	    (%%rax),%%ymm2				\n\t		vmovaps	    (%%r10),%%ymm10	\n\t"\
		"vmovaps	0x20(%%rax),%%ymm3				\n\t		vmovaps	0x20(%%r10),%%ymm11	\n\t"\
		"vmovaps	    (%%rcx),%%ymm6				\n\t		vmovaps	    (%%r11),%%ymm8 	\n\t"\
		"vmovaps	0x20(%%rcx),%%ymm7				\n\t		vmovaps	0x20(%%r11),%%ymm9 	\n\t"\
		"vmovaps	    (%%rbx),%%ymm0				\n\t		vmovaps	    (%%r12),%%ymm14	\n\t"\
		"vmovaps	    (%%rdx),%%ymm4				\n\t		vmovaps	0x20(%%r12),%%ymm15	\n\t"\
		"vmovaps	0x20(%%rbx),%%ymm1				\n\t		vmovaps	    (%%r13),%%ymm12	\n\t"\
		"vmovaps	0x20(%%rdx),%%ymm5				\n\t"	/*	vmovaps	0x20(%%r13),%%ymm13	Instead use ymm13 for 2.0: */"	vmovaps	(%%r9),%%ymm13 	\n\t"\
		"vsubpd		%%ymm0,%%ymm2,%%ymm2			\n\t		vsubpd		%%ymm8 ,%%ymm10,%%ymm10		\n\t"\
		"vsubpd		%%ymm1,%%ymm3,%%ymm3			\n\t		vsubpd		%%ymm9 ,%%ymm11,%%ymm11		\n\t"\
		"vsubpd		%%ymm4,%%ymm6,%%ymm6			\n\t		vsubpd		%%ymm12,%%ymm14,%%ymm14		\n\t"\
		"vsubpd		%%ymm5,%%ymm7,%%ymm7			\n\t		vsubpd	0x20(%%r13),%%ymm15,%%ymm15		\n\t"/* i7.im used straight from memory since ymm13 holds 2.0 */\
	"vfmadd132pd	%%ymm13,%%ymm2,%%ymm0			\n\t	vfmadd132pd		%%ymm13,%%ymm10,%%ymm8 		\n\t"\
	"vfmadd132pd	%%ymm13,%%ymm3,%%ymm1			\n\t	vfmadd132pd		%%ymm13,%%ymm11,%%ymm9 		\n\t"\
	"vfmadd132pd	%%ymm13,%%ymm6,%%ymm4			\n\t	vfmadd132pd		%%ymm13,%%ymm14,%%ymm12		\n\t"\
	"vfmadd132pd	%%ymm13,%%ymm7,%%ymm5			\n\t	vfmadd132pd	0x20(%%r13),%%ymm15,%%ymm13		\n\t"\
		"vsubpd		%%ymm7,%%ymm2,%%ymm2			\n\t		vsubpd		%%ymm12,%%ymm8 ,%%ymm8 		\n\t"\
		"vsubpd		%%ymm6,%%ymm3,%%ymm3			\n\t		vsubpd		%%ymm13,%%ymm9 ,%%ymm9 		\n\t"\
		"vsubpd		%%ymm4,%%ymm0,%%ymm0			\n\t		vsubpd		%%ymm14,%%ymm11,%%ymm11		\n\t"\
		"vsubpd		%%ymm5,%%ymm1,%%ymm1			\n\t		vsubpd		%%ymm15,%%ymm10,%%ymm10		\n\t"\
	"vmovaps	%%ymm14,(%%rax) 	\n\t"/* spill ymm14 to make room for 2.0 */"	vmovaps	(%%r9),%%ymm14 	\n\t"/* two */\
	"vfmadd132pd	%%ymm14,%%ymm0,%%ymm4			\n\t	vfmadd132pd		%%ymm14,%%ymm8 ,%%ymm12		\n\t"\
	"vfmadd132pd	%%ymm14,%%ymm1,%%ymm5			\n\t	vfmadd132pd		%%ymm14,%%ymm9 ,%%ymm13		\n\t"\
	"vfmadd132pd	%%ymm14,%%ymm2,%%ymm7			\n\t	vfmadd132pd		%%ymm14,%%ymm10,%%ymm15		\n\t"\
	"vfmadd132pd	%%ymm14,%%ymm3,%%ymm6			\n\t	vfmadd132pd		(%%rax),%%ymm11,%%ymm14		\n\t"/* (%%rax) = spilled ymm14 */\
		"														vsubpd		%%ymm15,%%ymm11,%%ymm11		\n\t"\
		"														vsubpd		%%ymm10,%%ymm14,%%ymm14		\n\t"\
		"													vfmadd132pd		(%%r9 ),%%ymm11,%%ymm15		\n\t"/* .two */\
		"													vfmadd132pd		(%%r9 ),%%ymm14,%%ymm10		\n\t"\
		/* Outputs 1-7 order-reversed in the SIMD version of this macro: Thus swap output byte-offset pairs */\
		/* 0x[40,60] <-> [1c0,1e0], [80,a0] <-> [180,1a0], [c0,e0] <-> [140,160] : */\
		"movq	%[__o0],%%rax						\n\t		movq	%[__o4],%%r10					\n\t"\
		"movq	%[__o1],%%rbx						\n\t		movq	%[__o5],%%r11					\n\t"\
		"vsubpd		%%ymm9 ,%%ymm0 ,%%ymm0 			\n\t	vfnmadd231pd	(%%rdi),%%ymm14,%%ymm2 		\n\t"/* .isrt2 */\
		"vsubpd		%%ymm13,%%ymm5 ,%%ymm5 			\n\t	vfnmadd231pd	(%%rdi),%%ymm11,%%ymm3 		\n\t"\
		"vsubpd		%%ymm12,%%ymm4 ,%%ymm4 			\n\t	vfnmadd231pd	(%%rdi),%%ymm15,%%ymm7 		\n\t"\
		"vsubpd		%%ymm8 ,%%ymm1 ,%%ymm1 			\n\t	vfnmadd231pd	(%%rdi),%%ymm10,%%ymm6 		\n\t"\
		"movq	%[__o2],%%rcx						\n\t		movq	%[__o6],%%r12					\n\t"\
		"movq	%[__o3],%%rdx						\n\t		movq	%[__o7],%%r13					\n\t"\
	"vmovaps	%%ymm8 ,0x20(%%rax) 	\n\t"/* spill ymm8  to make room for 2.0 */"	vmovaps	 (%%r9),%%ymm8	\n\t"/* two */\
	"vmovaps	%%ymm10,(%%rax) 	\n\t"/* spill ymm10 to make room for sqrt2 */"	vmovaps	0x40(%%r9),%%ymm10	\n\t"/* sqrt2 */\
		"vmovaps	%%ymm0 ,    (%%rcx)				\n\t		vmovaps		%%ymm2 ,    (%%rbx)		\n\t"\
		"vmovaps	%%ymm5 ,0x20(%%r10)				\n\t		vmovaps		%%ymm3 ,0x20(%%rdx)		\n\t"\
		"vmovaps	%%ymm4 ,    (%%r10)				\n\t		vmovaps		%%ymm7 ,    (%%rdx)		\n\t"\
		"vmovaps	%%ymm1 ,0x20(%%r12)				\n\t		vmovaps		%%ymm6 ,0x20(%%r11)		\n\t"\
	"vfmadd132pd		%%ymm8 ,%%ymm0 ,%%ymm9 		\n\t	vfmadd132pd		%%ymm10,%%ymm2 ,%%ymm14		\n\t"\
	"vfmadd132pd		%%ymm8 ,%%ymm5 ,%%ymm13		\n\t	vfmadd132pd		%%ymm10,%%ymm3 ,%%ymm11		\n\t"\
	"vfmadd132pd		%%ymm8 ,%%ymm4 ,%%ymm12		\n\t	vfmadd132pd		%%ymm10,%%ymm7 ,%%ymm15		\n\t"\
	"vfmadd132pd	0x20(%%rax),%%ymm1 ,%%ymm8 		\n\t	vfmadd132pd		(%%rax),%%ymm6 ,%%ymm10		\n\t"/* reload the 2 spills as mem-operands */\
		"vmovaps	%%ymm9 ,    (%%r12)				\n\t		vmovaps		%%ymm14,    (%%r11)		\n\t"\
		"vmovaps	%%ymm13,0x20(%%rax)				\n\t		vmovaps		%%ymm11,0x20(%%r13)		\n\t"\
		"vmovaps	%%ymm12,    (%%rax)				\n\t		vmovaps		%%ymm15,    (%%r13)		\n\t"\
		"vmovaps	%%ymm8 ,0x20(%%rcx)				\n\t		vmovaps		%%ymm10,0x20(%%rbx)		\n\t"\
		:					/* outputs: none */\
		: [__i0] "m" (Xi0)	/* All inputs from memory addresses here */\
		 ,[__i1] "m" (Xi1)\
		 ,[__i2] "m" (Xi2)\
		 ,[__i3] "m" (Xi3)\
		 ,[__i4] "m" (Xi4)\
		 ,[__i5] "m" (Xi5)\
		 ,[__i6] "m" (Xi6)\
		 ,[__i7] "m" (Xi7)\
		 ,[__o0] "m" (Xo0)\
		 ,[__o1] "m" (Xo1)\
		 ,[__o2] "m" (Xo2)\
		 ,[__o3] "m" (Xo3)\
		 ,[__o4] "m" (Xo4)\
		 ,[__o5] "m" (Xo5)\
		 ,[__o6] "m" (Xo6)\
		 ,[__o7] "m" (Xo7)\
		 ,[__isrt2] "m" (Xisrt2)\
		 ,[__two] "m" (Xtwo)\
		: "cc","memory","rax","rbx","rcx","rdx","rdi","r9","r10","r11","r12","r13","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15"		/* Clobbered registers */\
	);\
	}

	// AVX2 Opcount: 84 vec MEM [30 implicit], 31 ADD/SUB, 50 MUL, 36 FMA, i.e. trade 36 ADD+MUL for 36 FMA (plus one more ADD to generate SQRT2 from ISRT2).
	//*** Dec 2020: For guide to updated reduced-#arg IO address computation, cf. the SSE2 version of this macro. ***
	// Radix-8 DIT DFT with twiddles, out-of-place. Args: Xin0 = input base address; Xi1 = literal
	// byte offset between successive inputs ("e"-constraint compile-time constant, cf. the LEA
	// workaround below); Xout0/Xo_off = output base address and byte stride; Xtwid_ptrs -> array
	// of 14 pointers to the (c,s) components of the 7 complex twiddles; Xtwo -> 2.0.
	// Structure: 4 twiddle-CMUL + radix-2 butterfly passes done 2-abreast in the register file
	// (ymm0-7 / ymm8-15 columns), then the two length-4 subtransforms are combined into the
	// final radix-8 outputs, with output addresses computed late to economize on GPRs.
	#define SSE2_RADIX8_DIT_TWIDDLE_OOP(Xin0,Xi1, Xout0,Xo_off, Xtwid_ptrs, Xtwo)\
	{\
	__asm__ volatile (\
	/* i1 is base byte-offset, no need to lshift it prior to add: */\
		"xorq	%%r8,%%r8	\n\t	leaq	%c[i1](%%r8),%%r8	\n\t"/* movq|movslq of literal %c[i1] both segfaulted, workaround via LEA */\
		/* The twid_ptrs[] array holds ptrs to 14 complex twiddles in-order: (c,s)[1,2,3,4,5,6,7]: */\
		"movq	%[twid_ptrs],%%r14	\n\t"\
	/* Block 0/1 has just one twiddle-CMUL: */											/* Blocks 2/3 use separate register subset, can be done overlapped with 0/1: */\
	"movq		%[in0],%%rax		\n\t"\
		"leaq	(%%rax,%%r8  ),%%rbx	\n\t"\
		"leaq	(%%rax,%%r8,2),%%rcx	\n\t											movq	0x10(%%r14),%%r10				\n\t	movq	0x20(%%r14),%%r12			\n\t"/* c2,c3 */\
		"leaq	(%%rcx,%%r8  ),%%rdx	\n\t											movq	0x18(%%r14),%%r11				\n\t	movq	0x28(%%r14),%%r13			\n\t"/* s2,s3 */\
	"movq	    (%%r14),%%rdi	\n\t/* [rdi,rsi] -> [c,s] components of each sincos pair, */	vmovaps		(%%rcx),%%ymm8 		\n\t	vmovaps		0x20(%%rcx),%%ymm9 		\n\t"/* lcol: c1,s1 */\
	"movq	0x08(%%r14),%%rsi	\n\t/* (not truly a pair here in terms of rel-addresses). */	vmovaps	%%ymm9 ,%%ymm10			\n\t	vmovaps		%%ymm8 ,%%ymm11			\n\t"\
	"vmovaps	    (%%rbx),%%ymm4 		\n\t	vmovaps		0x20(%%rbx),%%ymm5 		\n\t	vmovaps		(%%rdx),%%ymm12			\n\t	vmovaps		0x20(%%rdx),%%ymm13		\n\t"\
	"vmovaps	    (%%rax),%%ymm0 		\n\t	vmovaps		0x20(%%rax),%%ymm1 		\n\t	vmovaps	%%ymm13,%%ymm14				\n\t	vmovaps		%%ymm12,%%ymm15			\n\t"\
	"vmovaps	%%ymm5 ,%%ymm6 			\n\t	vmovaps		%%ymm4 ,%%ymm7 			\n\t	vmulpd		(%%r10),%%ymm8 ,%%ymm8 	\n\t	vmulpd		(%%r10),%%ymm9 ,%%ymm9 	\n\t"\
	"vmulpd		 (%%rdi),%%ymm4 ,%%ymm4 \n\t	vmulpd		(%%rdi),%%ymm5 ,%%ymm5 	\n\t	vmulpd		(%%r12),%%ymm12,%%ymm12	\n\t	vmulpd		(%%r12),%%ymm13,%%ymm13	\n\t"\
	"vfmadd231pd (%%rsi),%%ymm6 ,%%ymm4 \n\t   vfnmadd231pd (%%rsi),%%ymm7 ,%%ymm5 	\n\t	vfmadd231pd (%%r11),%%ymm10,%%ymm8 	\n\t   vfnmadd231pd (%%r11),%%ymm11,%%ymm9 	\n\t"/* CMUL: re = c.re+s.im, im = c.im-s.re */\
	"vmovaps	%%ymm0 ,%%ymm2 			\n\t	vmovaps		%%ymm1 ,%%ymm3 			\n\t	vfmadd231pd (%%r13),%%ymm14,%%ymm12	\n\t   vfnmadd231pd (%%r13),%%ymm15,%%ymm13	\n\t"\
	"vaddpd		%%ymm4 ,%%ymm0 ,%%ymm0 	\n\t	vaddpd		%%ymm5 ,%%ymm1 ,%%ymm1 	\n\t	vmovaps		%%ymm8 ,%%ymm10			\n\t	vmovaps		%%ymm9 ,%%ymm11			\n\t"\
	"vsubpd		%%ymm4 ,%%ymm2 ,%%ymm2 	\n\t	vsubpd		%%ymm5 ,%%ymm3 ,%%ymm3 	\n\t	vaddpd		%%ymm12,%%ymm8 ,%%ymm8 	\n\t	vaddpd		%%ymm13,%%ymm9 ,%%ymm9 	\n\t"\
	"vmovaps	%%ymm0 ,    (%%rax)		\n\t	vmovaps		%%ymm1 ,0x20(%%rax)		\n\t	vsubpd		%%ymm12,%%ymm10,%%ymm10	\n\t	vsubpd		%%ymm13,%%ymm11,%%ymm11	\n\t"\
	"vmovaps	%%ymm2 ,    (%%rbx)		\n\t	vmovaps		%%ymm3 ,0x20(%%rbx)		\n\t	vmovaps		%%ymm8 ,    (%%rcx)		\n\t	vmovaps		%%ymm9 ,0x20(%%rcx)		\n\t"\
														/* Now do radix-2 butterfly: */	"	vmovaps		%%ymm10,    (%%rdx)		\n\t	vmovaps		%%ymm11,0x20(%%rdx)		\n\t"\
	/* Blocks 4/5: */																		/* Blocks 6/7 use separate register subset, can be done overlapped with 4/5: */\
	"shlq	$2,%%r8			\n\t"/* From here on only need offset i4 = 4*i1 */\
	"addq	%%r8,%%rax		\n\t"/* Remaining 4 I-address-calculations are in-place += i4, so use ADD, faster than LEA */\
	"addq	%%r8,%%rbx		\n\t"\
	"addq	%%r8,%%rcx		\n\t"\
	"addq	%%r8,%%rdx		\n\t"\
	"vmovaps		(%%rax),%%ymm0 		\n\t	vmovaps		0x20(%%rax),%%ymm1 		\n\t	vmovaps		(%%rcx),%%ymm8 			\n\t	vmovaps		0x20(%%rcx),%%ymm9 		\n\t"\
	"vmovaps	%%ymm1 ,%%ymm2 			\n\t	vmovaps		%%ymm0 ,%%ymm3 			\n\t	vmovaps		%%ymm9 ,%%ymm10			\n\t	vmovaps		%%ymm8 ,%%ymm11			\n\t"\
	"vmovaps		(%%rbx),%%ymm4 		\n\t	vmovaps		0x20(%%rbx),%%ymm5 		\n\t	vmovaps		(%%rdx),%%ymm12			\n\t	vmovaps		0x20(%%rdx),%%ymm13		\n\t"\
	"vmovaps	%%ymm5 ,%%ymm6 			\n\t	vmovaps		%%ymm4 ,%%ymm7 			\n\t	vmovaps		%%ymm13,%%ymm14			\n\t	vmovaps		%%ymm12,%%ymm15			\n\t"\
	"subq		%%r8,%%rax			\n\t"/* restore rax-rdx to the Block 0-3 addresses for the later reloads */\
	"subq		%%r8,%%rbx			\n\t"\
	"subq		%%r8,%%rcx			\n\t"\
	"subq		%%r8,%%rdx			\n\t"\
	"movq	0x30(%%r14),%%rdi			\n\t												movq	0x50(%%r14),%%r10				\n\t"/* c4,c6 */\
	"movq	0x40(%%r14),%%r8 			\n\t												movq	0x60(%%r14),%%r12				\n\t"/* c5,c7 */\
	"movq	0x38(%%r14),%%rsi			\n\t												movq	0x58(%%r14),%%r11				\n\t"/* s4,s6 */\
	"movq	0x48(%%r14),%%r9 			\n\t												movq	0x68(%%r14),%%r13				\n\t"/* s5,s7 */\
	"vmulpd		 (%%rdi),%%ymm0 ,%%ymm0 \n\t	vmulpd		 (%%rdi),%%ymm1 ,%%ymm1 \n\t	vmulpd		(%%r10),%%ymm8 ,%%ymm8 	\n\t	vmulpd		 (%%r10),%%ymm9 ,%%ymm9 \n\t"\
	"vmulpd		 (%%r8 ),%%ymm4 ,%%ymm4 \n\t	vmulpd		 (%%r8 ),%%ymm5 ,%%ymm5 \n\t	vmulpd		(%%r12),%%ymm12,%%ymm12	\n\t	vmulpd		 (%%r12),%%ymm13,%%ymm13\n\t"\
	"vfmadd231pd (%%rsi),%%ymm2 ,%%ymm0 \n\t	vfnmadd231pd (%%rsi),%%ymm3 ,%%ymm1 \n\t	vfmadd231pd (%%r11),%%ymm10,%%ymm8 	\n\t	vfnmadd231pd (%%r11),%%ymm11,%%ymm9 \n\t"\
	"vfmadd231pd (%%r9 ),%%ymm6 ,%%ymm4 \n\t	vfnmadd231pd (%%r9 ),%%ymm7 ,%%ymm5 \n\t	vfmadd231pd (%%r13),%%ymm14,%%ymm12	\n\t	vfnmadd231pd (%%r13),%%ymm15,%%ymm13\n\t"\
	/* Now do radix-2 butterfly: */\
	"vmovaps	%%ymm0 ,%%ymm2 			\n\t	vmovaps		%%ymm1 ,%%ymm3 			\n\t	vmovaps		%%ymm8 ,%%ymm10			\n\t	vmovaps		%%ymm9 ,%%ymm11			\n\t"\
	"vaddpd		%%ymm4 ,%%ymm0 ,%%ymm0 	\n\t	vaddpd		%%ymm5 ,%%ymm1 ,%%ymm1 	\n\t	vaddpd		%%ymm12,%%ymm8 ,%%ymm8 	\n\t	vaddpd		%%ymm13,%%ymm9 ,%%ymm9 	\n\t"\
	"vsubpd		%%ymm4 ,%%ymm2 ,%%ymm2 	\n\t	vsubpd		%%ymm5 ,%%ymm3 ,%%ymm3 	\n\t	vsubpd		%%ymm12,%%ymm10,%%ymm10	\n\t	vsubpd		%%ymm13,%%ymm11,%%ymm11	\n\t"\
	/* Reload Block 0-3 outputs into r4-7,c-f, combine to get the 2 length-4 subtransform... */\
	"vmovaps		(%%rax),%%ymm4 		\n\t	vmovaps		0x20(%%rax),%%ymm5 		\n\t"\
	"vmovaps		(%%rbx),%%ymm6 		\n\t	vmovaps		0x20(%%rbx),%%ymm7 		\n\t"\
	"vmovaps		(%%rcx),%%ymm12		\n\t	vmovaps		0x20(%%rcx),%%ymm13		\n\t"\
	"vmovaps		(%%rdx),%%ymm14		\n\t	vmovaps		0x20(%%rdx),%%ymm15		\n\t"\
	"movq		%[out0],%%rax			\n\t	movq		%[o_off],%%r8		\n\t"/* out0, off1 */\
	"movq		%[two],%%rsi			\n\t	leaq		(%%r8,%%r8),%%r9	\n\t"/* (vec_dbl)2.0, off2 */\
		"										leaq		(%%r9,%%r9),%%r10	\n\t"/* off4 */\
	"vsubpd		%%ymm12,%%ymm4 ,%%ymm4 	\n\t	vsubpd		%%ymm13,%%ymm5 ,%%ymm5 	\n\t"\
	"vsubpd		%%ymm15,%%ymm6 ,%%ymm6 	\n\t	vsubpd		%%ymm14,%%ymm7 ,%%ymm7 	\n\t"\
	"vsubpd		%%ymm8 ,%%ymm0 ,%%ymm0 	\n\t	vsubpd		%%ymm9 ,%%ymm1 ,%%ymm1 	\n\t"\
	"vsubpd		%%ymm11,%%ymm2 ,%%ymm2 	\n\t	vsubpd		%%ymm10,%%ymm3 ,%%ymm3 	\n\t"\
	/* We hope the microcode execution engine inlines the MULs with the above SUBs: */\
	"vmovaps	%%ymm10,(%%rdx) 	\n\t"/* spill ymm10 to make room for 2.0 */"	vmovaps	(%%rsi),%%ymm10 \n\t"/* two */\
	"vfmadd132pd %%ymm10,%%ymm4,%%ymm12	\n\t	vfmadd132pd %%ymm10,%%ymm5 ,%%ymm13	\n\t"\
	"vfmadd132pd %%ymm10,%%ymm6,%%ymm15	\n\t	vfmadd132pd %%ymm10,%%ymm7 ,%%ymm14	\n\t"\
	"vfmadd132pd %%ymm10,%%ymm0,%%ymm8 	\n\t	vfmadd132pd %%ymm10,%%ymm1 ,%%ymm9 	\n\t"\
	"vfmadd132pd %%ymm10,%%ymm2,%%ymm11	\n\t	vfmadd132pd (%%rdx),%%ymm3 ,%%ymm10	\n\t"/* (%%rdx) = spilled ymm10 */\
	/* In terms of our original scalar-code prototyping macro, the data are: __tr0 = _r[c,f,4,6,8,b,0,2], __ti0 = _r[d,7,5,e,9,3,1,a]; */\
	/* Now combine the two half-transforms: */\
	/* Need r2/3+- a/b combos for the *ISRT2 preceding the output 4-7 radix-2 butterflies, so start them first: */\
	"vsubpd		%%ymm3 ,%%ymm11,%%ymm11	\n\t	vsubpd		%%ymm10,%%ymm2 ,%%ymm2 	\n\t"\
	"vsubpd		%%ymm8 ,%%ymm12,%%ymm12	\n\t	vsubpd		%%ymm9 ,%%ymm13,%%ymm13	\n\t"\
	"vsubpd		%%ymm1 ,%%ymm4 ,%%ymm4 	\n\t	vsubpd		%%ymm0 ,%%ymm5 ,%%ymm5 	\n\t"\
	"vmovaps	%%ymm0 ,(%%rdx) 	\n\t"/* spill ymm0 to make room for 2.0 */"	vmovaps	(%%rsi),%%ymm0  \n\t"/* two */\
	"vfmadd132pd %%ymm0,%%ymm11,%%ymm3 	\n\t	vfmadd132pd %%ymm0 ,%%ymm2 ,%%ymm10	\n\t"\
	"vfmadd132pd %%ymm0,%%ymm12,%%ymm8 	\n\t	vfmadd132pd %%ymm0 ,%%ymm13,%%ymm9 	\n\t"\
	"vfmadd132pd %%ymm0,%%ymm4 ,%%ymm1 	\n\t	vfmadd132pd (%%rdx),%%ymm5 ,%%ymm0 	\n\t"/* (%%rdx) = spilled ymm0 */\
	/*movq		%[o0],%%rax		[o0] already in rax */	\
	"leaq	(%%rax,%%r9 ),%%rcx		\n\t"/* out0 + off2, compute first to allow time for LEA to finish before += off4 to get out0 + off6 */\
	"leaq	(%%rax,%%r10),%%rbx		\n\t"/* out0 + off4 */\
	"leaq	(%%rcx,%%r10),%%rdx		\n\t"/* out0 + off6 */\
	"vmovaps	%%ymm12,    (%%rbx)		\n\t	vmovaps		%%ymm13,0x20(%%rbx)		\n\t"/* __Br1 = _rc;	__Bi1 = _rd; */\
	/* Use that _rc,d free to stick ISRT2 into _rc and SQRT2 into _rd: */\
	"vmovaps	-0x20(%%rdi),%%ymm12	\n\t	vaddpd	%%ymm12,%%ymm12,%%ymm13		\n\t"/* ymm12 = ISRT2;	ymm13 = SQRT2; NOTE(review): rdi -> twiddle c4 here, so this assumes ISRT2 sits 0x20 below c4 in the twiddle layout - confirm vs callers */\
	"vmovaps	%%ymm4 ,    (%%rdx)		\n\t	vmovaps		%%ymm0 ,0x20(%%rdx)		\n\t"/* __Br3 = _r4;	__Bi3 = _r0; */\
	"vmovaps	%%ymm8 ,    (%%rax)		\n\t	vmovaps		%%ymm9 ,0x20(%%rax)		\n\t"/* __Br0 = _r8;	__Bi0 = _r9; */\
	"vmovaps	%%ymm1 ,    (%%rcx)		\n\t	vmovaps		%%ymm5 ,0x20(%%rcx)		\n\t"/* __Br2 = _r1;	__Bi2 = _r5; */\
	"vfnmadd231pd %%ymm12,%%ymm3,%%ymm15\n\t	vfnmadd231pd %%ymm12,%%ymm11,%%ymm7 \n\t"/* x = x - y.isrt2 */\
	"vfnmadd231pd %%ymm12,%%ymm2,%%ymm6	\n\t	vfnmadd231pd %%ymm12,%%ymm10,%%ymm14\n\t"\
	" vfmadd132pd %%ymm13,%%ymm15,%%ymm3\n\t	 vfmadd132pd %%ymm13,%%ymm7 ,%%ymm11\n\t"/* y = x + y.sqrt2 */\
	" vfmadd132pd %%ymm13,%%ymm6 ,%%ymm2\n\t	 vfmadd132pd %%ymm13,%%ymm14,%%ymm10\n\t"\
	"addq		%%r8 ,%%rax			\n\t"/* out0 + off[1,5,3,7] */\
	"addq		%%r8 ,%%rbx			\n\t"\
	"addq		%%r8 ,%%rcx			\n\t"\
	"addq		%%r8 ,%%rdx			\n\t"\
	"vmovaps	%%ymm3 ,    (%%rax)		\n\t	vmovaps		%%ymm7 ,0x20(%%rax)		\n\t"/* __Br4 = _r3;	__Bi4 = _r7; */\
	"vmovaps	%%ymm15,    (%%rbx)		\n\t	vmovaps		%%ymm11,0x20(%%rbx)		\n\t"/* __Br5 = _rf;	__Bi5 = _rb; */\
	"vmovaps	%%ymm6 ,    (%%rcx)		\n\t	vmovaps		%%ymm14,0x20(%%rcx)		\n\t"/* __Br6 = _r6;	__Bi6 = _re; */\
	"vmovaps	%%ymm2 ,    (%%rdx)		\n\t	vmovaps		%%ymm10,0x20(%%rdx)		\n\t"/* __Br7 = _r2;	__Bi7 = _ra; */\
		:					/* outputs: none */\
		: [in0] "m" (Xin0)	/* All 'm'-inputs from memory addresses here... */\
		 ,[i1] "e" (Xi1)	/* ...except for 'e'-inputs which are literal byte offsets */\
		 ,[out0] "m" (Xout0)\
		 ,[o_off] "m" (Xo_off)/* O-address pointer-stride */\
		 ,[twid_ptrs] "m" (Xtwid_ptrs)\
		 ,[two] "m" (Xtwo)/* Only used in FMA implementations of this macro */\
		: "cc","memory","rax","rbx","rcx","rdx","rsi","rdi","r8","r9","r10","r11","r12","r13","r14","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15"	/* Clobbered registers */\
	);\
	}

/*** Prefetch odd-index input addresses in the DIF macro below, and even-index output addresses in SSE2_RADIX16_DIF_TWIDDLE_OOP ***/

	// Based on the SSE2_RADIX16_DIF_NOTWIDDLE macro in radix16_ditN_cy_dif1_gcc64.h, but with completely
	// specifiable 16-output addressing required for usage as the power-of-2 component of a twiddleless
	// radix = [odd*2^n] DFT routine.
	/* Dec 2020: Needed to cut #args for Apple M1/Clang builds on Arm64 - do similar on x86 to avoid
	multiple versions of the macro having different arglists. Replace 16 O-addresses with O-base-address
	out0 and pointer to array of 16 int offset-indices: */
	#define SSE2_RADIX16_DIF_0TWIDDLE(Xin0,Xi1,Xi2,Xi3,Xi4, Xisrt2,Xtwo, Xout0,Xoff)\
	{\
	__asm__ volatile (\
		"movq	%[__two],%%r15	\n\t"/* two, used for FMA-based double-and-ADD/SUBs */\
	/* Block 0: SSE2_RADIX4_DIF_IN_PLACE(r1 , r17, r9 , r25): */	/* Block 2: SSE2_RADIX4_DIF_IN_PLACE(r5 , r21, r13, r29): */\
	"movq	%[__in0],%%rax	\n\t"/* Note BR of r[abcd]x: b<-->c */	"	leaq	%c[__i2](%%rax),%%r10	\n\t"/* addr += 2*ostride */\
	"leaq	%c[__i4](%%rax),%%rcx	\n\t"/* __in0+  [4*istride] */	"	leaq	%c[__i2](%%rcx),%%r12	\n\t"/* w.r.to to Block 0 */\
	"leaq	%c[__i4](%%rcx),%%rbx	\n\t"/* __in0+2*[4*istride] */	"	leaq	%c[__i2](%%rbx),%%r11	\n\t"/* Note BR of r1[0123]: r11<-->r12 */\
	"leaq	%c[__i4](%%rbx),%%rdx	\n\t"/* __in0+3*[4*istride] */	"	leaq	%c[__i2](%%rdx),%%r13	\n\t"\
		"vmovaps	    (%%rbx),%%ymm0							\n\t	vmovaps	    (%%r11),%%ymm8 	\n\t"\
		"vmovaps	0x20(%%rbx),%%ymm1							\n\t	vmovaps	0x20(%%r11),%%ymm9 	\n\t"\
		"vmovaps	    (%%rax),%%ymm2							\n\t	vmovaps	    (%%r10),%%ymm10	\n\t"\
		"vmovaps	0x20(%%rax),%%ymm3							\n\t	vmovaps	0x20(%%r10),%%ymm11	\n\t"\
		"vmovaps	    (%%rdx),%%ymm4							\n\t	vmovaps	    (%%r13),%%ymm12	\n\t"\
		"vmovaps	0x20(%%rdx),%%ymm5							\n\t	vmovaps	0x20(%%r13),%%ymm13	\n\t"\
		"vmovaps	    (%%rcx),%%ymm6							\n\t	vmovaps	    (%%r12),%%ymm14	\n\t"\
		"vmovaps	0x20(%%rcx),%%ymm7							\n\t	vmovaps	0x20(%%r12),%%ymm15	\n\t"\
		"vsubpd		%%ymm0 ,%%ymm2,%%ymm2						\n\t	vsubpd		%%ymm8 ,%%ymm10,%%ymm10	\n\t"\
		"vsubpd		%%ymm1 ,%%ymm3,%%ymm3						\n\t	vsubpd		%%ymm9 ,%%ymm11,%%ymm11	\n\t"\
		"vsubpd		%%ymm4 ,%%ymm6,%%ymm6						\n\t	vsubpd		%%ymm12,%%ymm14,%%ymm14	\n\t"\
		"vsubpd		%%ymm5 ,%%ymm7,%%ymm7						\n\t	vsubpd		%%ymm13,%%ymm15,%%ymm15	\n\t"\
	"vmovaps	%%ymm13,(%%rax) 	\n\t"/* spill ymm13 to make room for 2.0 */"	vmovaps	(%%r15),%%ymm13	\n\t"/* two */\
	"vfmadd132pd	%%ymm13,%%ymm2,%%ymm0						\n\t	vfmadd132pd	%%ymm13,%%ymm10,%%ymm8 	\n\t"\
	"vfmadd132pd	%%ymm13,%%ymm3,%%ymm1						\n\t	vfmadd132pd	%%ymm13,%%ymm11,%%ymm9 	\n\t"\
	"vfmadd132pd	%%ymm13,%%ymm6,%%ymm4						\n\t	vfmadd132pd	%%ymm13,%%ymm14,%%ymm12	\n\t"\
	"vfmadd132pd	%%ymm13,%%ymm7,%%ymm5						\n\t	vfmadd132pd	(%%rax),%%ymm15,%%ymm13	\n\t"\
		"vsubpd	%%ymm4,%%ymm0,%%ymm0							\n\t	vsubpd	%%ymm12,%%ymm8 ,%%ymm8 		\n\t"\
		"vsubpd	%%ymm5,%%ymm1,%%ymm1							\n\t	vsubpd	%%ymm13,%%ymm9 ,%%ymm9 		\n\t"\
		"vsubpd	%%ymm7,%%ymm2,%%ymm2							\n\t	vsubpd	%%ymm15,%%ymm10,%%ymm10		\n\t"\
		"vsubpd	%%ymm6,%%ymm3,%%ymm3							\n\t	vsubpd	%%ymm14,%%ymm11,%%ymm11		\n\t"\
		"vmovaps	%%ymm0,    (%%rbx)							\n\t	vmovaps	%%ymm8 ,    (%%r11)	\n\t"\
		"vmovaps	%%ymm1,0x20(%%rbx)							\n\t	vmovaps	%%ymm9 ,0x20(%%r11)	\n\t"\
		"vmovaps	%%ymm2,    (%%rcx)							\n\t	vmovaps	%%ymm10,    (%%r12)	\n\t"\
		"vmovaps	%%ymm3,0x20(%%rdx)							\n\t	vmovaps	%%ymm11,0x20(%%r13)	\n\t"\
	"vmovaps	%%ymm14,(%%rax) 	\n\t"/* spill ymm14 to make room for 2.0 */"	vmovaps	(%%r15),%%ymm14	\n\t"/* two */\
	"vfmadd132pd	%%ymm14,%%ymm0,%%ymm4						\n\t	vfmadd132pd	%%ymm14,%%ymm8 ,%%ymm12		\n\t"\
	"vfmadd132pd	%%ymm14,%%ymm1,%%ymm5						\n\t	vfmadd132pd	%%ymm14,%%ymm9 ,%%ymm13		\n\t"\
	"vfmadd132pd	%%ymm14,%%ymm2,%%ymm7						\n\t	vfmadd132pd	%%ymm14,%%ymm10,%%ymm15		\n\t"\
	"vfmadd132pd	%%ymm14,%%ymm3,%%ymm6						\n\t	vfmadd132pd	(%%rax),%%ymm11,%%ymm14		\n\t"\
		"vmovaps	%%ymm4,    (%%rax)							\n\t	vmovaps	%%ymm12,    (%%r10)	\n\t"\
		"vmovaps	%%ymm5,0x20(%%rax)							\n\t	vmovaps	%%ymm13,0x20(%%r10)	\n\t"\
		"vmovaps	%%ymm7,    (%%rdx)							\n\t	vmovaps	%%ymm15,    (%%r13)	\n\t"\
		"vmovaps	%%ymm6,0x20(%%rcx)							\n\t	vmovaps	%%ymm14,0x20(%%r12)	\n\t"\
	/* Block 1: SSE2_RADIX4_DIF_IN_PLACE(r3 , r19, r11, r27): */	/* Block 3: SSE2_RADIX4_DIF_IN_PLACE(r7 , r23, r15, r31): */\
		"addq	$%c[__i1],%%rax	\n\t"/* addr += 1*ostride */"	addq	$%c[__i1],%%r10	\n\t"/* addr += 1*ostride */\
		"addq	$%c[__i1],%%rbx	\n\t"/* w.r.to to Block 0 */"	addq	$%c[__i1],%%r12	\n\t"/* w.r.to to Block 2 */\
		"addq	$%c[__i1],%%rcx							\n\t	addq	$%c[__i1],%%r11	\n\t"\
		"addq	$%c[__i1],%%rdx							\n\t	addq	$%c[__i1],%%r13	\n\t"\
		"vmovaps	    (%%rbx),%%ymm0							\n\t	vmovaps	    (%%r11),%%ymm8 	\n\t"\
		"vmovaps	0x20(%%rbx),%%ymm1							\n\t	vmovaps	0x20(%%r11),%%ymm9 	\n\t"\
		"vmovaps	    (%%rax),%%ymm2							\n\t	vmovaps	    (%%r10),%%ymm10	\n\t"\
		"vmovaps	0x20(%%rax),%%ymm3							\n\t	vmovaps	0x20(%%r10),%%ymm11	\n\t"\
		"vmovaps	    (%%rdx),%%ymm4							\n\t	vmovaps	    (%%r13),%%ymm12	\n\t"\
		"vmovaps	0x20(%%rdx),%%ymm5							\n\t	vmovaps	0x20(%%r13),%%ymm13	\n\t"\
		"vmovaps	    (%%rcx),%%ymm6							\n\t	vmovaps	    (%%r12),%%ymm14	\n\t"\
		"vmovaps	0x20(%%rcx),%%ymm7							\n\t	vmovaps	0x20(%%r12),%%ymm15	\n\t"\
		"vsubpd		%%ymm0 ,%%ymm2,%%ymm2						\n\t	vsubpd		%%ymm8 ,%%ymm10,%%ymm10	\n\t"\
		"vsubpd		%%ymm1 ,%%ymm3,%%ymm3						\n\t	vsubpd		%%ymm9 ,%%ymm11,%%ymm11	\n\t"\
		"vsubpd		%%ymm4 ,%%ymm6,%%ymm6						\n\t	vsubpd		%%ymm12,%%ymm14,%%ymm14	\n\t"\
		"vsubpd		%%ymm5 ,%%ymm7,%%ymm7						\n\t	vsubpd		%%ymm13,%%ymm15,%%ymm15	\n\t"\
	"vmovaps	%%ymm13,(%%rax) 	\n\t"/* spill ymm13 to make room for 2.0 */"	vmovaps	(%%r15),%%ymm13	\n\t"/* two */\
	"vfmadd132pd	%%ymm13,%%ymm2,%%ymm0						\n\t	vfmadd132pd	%%ymm13,%%ymm10,%%ymm8 	\n\t"\
	"vfmadd132pd	%%ymm13,%%ymm3,%%ymm1						\n\t	vfmadd132pd	%%ymm13,%%ymm11,%%ymm9 	\n\t"\
	"vfmadd132pd	%%ymm13,%%ymm6,%%ymm4						\n\t	vfmadd132pd	%%ymm13,%%ymm14,%%ymm12	\n\t"\
	"vfmadd132pd	%%ymm13,%%ymm7,%%ymm5						\n\t	vfmadd132pd	(%%rax),%%ymm15,%%ymm13	\n\t"\
		"vsubpd	%%ymm4,%%ymm0,%%ymm0							\n\t	vsubpd	%%ymm12,%%ymm8 ,%%ymm8 		\n\t"\
		"vsubpd	%%ymm5,%%ymm1,%%ymm1							\n\t	vsubpd	%%ymm13,%%ymm9 ,%%ymm9 		\n\t"\
		"vsubpd	%%ymm7,%%ymm2,%%ymm2							\n\t	vsubpd	%%ymm15,%%ymm10,%%ymm10		\n\t"\
		"vsubpd	%%ymm6,%%ymm3,%%ymm3							\n\t	vsubpd	%%ymm14,%%ymm11,%%ymm11		\n\t"\
		"vmovaps	%%ymm0,    (%%rbx)							\n\t	vmovaps	%%ymm8 ,    (%%r11)	\n\t"\
		"vmovaps	%%ymm1,0x20(%%rbx)							\n\t	vmovaps	%%ymm9 ,0x20(%%r11)	\n\t"\
		"vmovaps	%%ymm2,    (%%rcx)							\n\t	vmovaps	%%ymm10,    (%%r12)	\n\t"\
		"vmovaps	%%ymm3,0x20(%%rdx)							\n\t	vmovaps	%%ymm11,0x20(%%r13)	\n\t"\
	"vmovaps	%%ymm14,(%%rax) 	\n\t"/* spill ymm14 to make room for 2.0 */"	vmovaps	(%%r15),%%ymm14	\n\t"/* two */\
	"vfmadd132pd	%%ymm14,%%ymm0,%%ymm4						\n\t	vfmadd132pd	%%ymm14,%%ymm8 ,%%ymm12		\n\t"\
	"vfmadd132pd	%%ymm14,%%ymm1,%%ymm5						\n\t	vfmadd132pd	%%ymm14,%%ymm9 ,%%ymm13		\n\t"\
	"vfmadd132pd	%%ymm14,%%ymm2,%%ymm7						\n\t	vfmadd132pd	%%ymm14,%%ymm10,%%ymm15		\n\t"\
	"vfmadd132pd	%%ymm14,%%ymm3,%%ymm6						\n\t	vfmadd132pd	(%%rax),%%ymm11,%%ymm14		\n\t"\
		"vmovaps	%%ymm4,    (%%rax)							\n\t	vmovaps	%%ymm12,    (%%r10)	\n\t"\
		"vmovaps	%%ymm5,0x20(%%rax)							\n\t	vmovaps	%%ymm13,0x20(%%r10)	\n\t"\
		"vmovaps	%%ymm7,    (%%rdx)							\n\t	vmovaps	%%ymm15,    (%%r13)	\n\t"\
		"vmovaps	%%ymm6,0x20(%%rcx)							\n\t	vmovaps	%%ymm14,0x20(%%r12)	\n\t"\
	/*****************************************************************************************
	**** Now do 4 DFTs with internal twiddles on the 1*stride - separated data. Do blocks ****
	**** in order 0,2,1,3 to allow increment-only of rsi-datum from 1 block to the next:  ****
	*****************************************************************************************/\
	/* Problem: In the sse2 and avx versions of the reduced-#args macro with their single columns of
	instructions, had plenty of GPRs to store both I and O-addresses simultaneously. In the 2-column avx2
	and avx-512 versions, don't have enough GPRs. But since we don't need the O-addresses until we are ready
	to write outputs, just move the O-address computations down to that part of each 4-DFT sub-block: */\
	/* Block 0: r0-3 */												/* Block 1: r8-b */\
		"movq	%[__in0],%%rsi	\n\t	leaq %c[__i4](%%rsi),%%r8 \n\t addq $%c[__i4],%%r8 \n\t"/* __in0+[0,8]*ostride */\
	/* Need separate address for Im parts of outputs due to literal-offsets below: */\
		"leaq	0x20(%%rsi),%%rdi								\n\t	leaq	0x20(%%r8 ),%%r9 	\n\t"\
		"vmovaps	        (%%rsi),%%ymm0						\n\t	vmovaps	        (%%r8 ),%%ymm8 	\n\t"/* ar */\
		"vmovaps	        (%%rdi),%%ymm1						\n\t	vmovaps	        (%%r9 ),%%ymm9 	\n\t"/* ai */\
		"vmovaps	%c[__i2](%%rsi),%%ymm2						\n\t	vmovaps	%c[__i2](%%r8 ),%%ymm10	\n\t"/* br */\
		"vmovaps	%c[__i2](%%rdi),%%ymm3						\n\t	vmovaps	%c[__i2](%%r9 ),%%ymm11	\n\t"/* bi */\
		"vmovaps	%c[__i1](%%rsi),%%ymm4						\n\t	vmovaps	%c[__i1](%%r8 ),%%ymm12	\n\t"/* cr */\
		"vmovaps	%c[__i1](%%rdi),%%ymm5						\n\t	vmovaps	%c[__i1](%%r9 ),%%ymm13	\n\t"/* ci */\
		"vmovaps	%c[__i3](%%rsi),%%ymm6						\n\t	vmovaps	%c[__i3](%%r8 ),%%ymm14	\n\t"/* dr */\
		"vmovaps	%c[__i3](%%rdi),%%ymm7						\n\t	vmovaps	%c[__i3](%%r9 ),%%ymm15	\n\t"/* di */\
		"																movq	%[__isrt2],%%r14	\n\t"\
		"vsubpd		%%ymm2 ,%%ymm0,%%ymm0						\n\t	vsubpd		%%ymm11,%%ymm8 ,%%ymm8 	\n\t"/* ar-bi */\
		"vsubpd		%%ymm3 ,%%ymm1,%%ymm1						\n\t	vsubpd		%%ymm10,%%ymm9 ,%%ymm9 	\n\t"/* ai-br */\
		"vsubpd		%%ymm6 ,%%ymm4,%%ymm4						\n\t	vsubpd		%%ymm13,%%ymm12,%%ymm12	\n\t"/* cr-ci */\
		"vsubpd		%%ymm7 ,%%ymm5,%%ymm5						\n\t	vsubpd		%%ymm14,%%ymm15,%%ymm15	\n\t"/* di-dr */\
	"vmovaps	%%ymm14,(%%rax) 	\n\t"/* spill ymm14 to make room for 2.0 */"	vmovaps	(%%r15),%%ymm14	\n\t"/* two */\
	"vfmadd132pd	%%ymm14,%%ymm0,%%ymm2						\n\t	vfmadd132pd	%%ymm14,%%ymm8 ,%%ymm11	\n\t"/* ar+bi */\
	"vfmadd132pd	%%ymm14,%%ymm1,%%ymm3						\n\t	vfmadd132pd	%%ymm14,%%ymm9 ,%%ymm10	\n\t"/* ai+br */\
	"vfmadd132pd	%%ymm14,%%ymm4,%%ymm6						\n\t	vfmadd132pd	%%ymm14,%%ymm12,%%ymm13	\n\t"/* cr+ci */\
	"vfmadd132pd	%%ymm14,%%ymm5,%%ymm7						\n\t	vfmadd132pd	(%%rax),%%ymm15,%%ymm14	\n\t"/* di+dr */\
		"																	vsubpd	%%ymm14,%%ymm12,%%ymm12		\n\t"\
		"																	vsubpd	%%ymm15,%%ymm13,%%ymm13		\n\t"\
	"movq	%[out0],%%r8	\n\t	movq	%[off],%%r9	\n\t"/* Load output base-address into r8 and offset-array pointer into r9 */\
		"movslq		    (%%r9),%%rax	\n\t"/*        off0 */"movslq	0x10(%%r9),%%r10	\n\t"/*        off4 */\
		"movslq		0x04(%%r9),%%rbx	\n\t"/*        off1 */"movslq	0x14(%%r9),%%r11	\n\t"/*        off5 */\
		"movslq		0x08(%%r9),%%rcx	\n\t"/*        off2 */"movslq	0x18(%%r9),%%r12	\n\t"/*        off6 */\
		"movslq		0x0c(%%r9),%%rdx	\n\t"/*        off3 */"movslq	0x1c(%%r9),%%r13	\n\t"/*        off7 */\
		"leaq	(%%r8,%%rax,8),%%rax	\n\t"/* out0 + off0 */"leaq	(%%r8,%%r10,8),%%r10	\n\t"/* out0 + off4 */\
		"leaq	(%%r8,%%rbx,8),%%rbx	\n\t"/* out0 + off1 */"leaq	(%%r8,%%r11,8),%%r11	\n\t"/* out0 + off5 */\
		"leaq	(%%r8,%%rcx,8),%%rcx	\n\t"/* out0 + off2 */"leaq	(%%r8,%%r12,8),%%r12	\n\t"/* out0 + off6 */\
		"leaq	(%%r8,%%rdx,8),%%rdx	\n\t"/* out0 + off3 */"leaq	(%%r8,%%r13,8),%%r13	\n\t"/* out0 + off7 */\
		"vsubpd		%%ymm6,%%ymm2,%%ymm2						\n\t	vfmadd132pd	(%%r15),%%ymm12,%%ymm14		\n\t"\
		"vsubpd		%%ymm7,%%ymm3,%%ymm3						\n\t	vfmadd132pd	(%%r15),%%ymm13,%%ymm15		\n\t"\
		"vsubpd		%%ymm5,%%ymm0,%%ymm0						\n\t	vfnmadd231pd	(%%r14),%%ymm12,%%ymm8 	\n\t"/* x = x - y.isrt2 */\
		"vsubpd		%%ymm4,%%ymm1,%%ymm1						\n\t	vfnmadd231pd	(%%r14),%%ymm13,%%ymm10	\n\t"\
		"vmovaps	%%ymm2,    (%%rbx)							\n\t	vfnmadd231pd	(%%r14),%%ymm14,%%ymm9 	\n\t"\
		"vmovaps	%%ymm3,0x20(%%rbx)							\n\t	vfnmadd231pd	(%%r14),%%ymm15,%%ymm11	\n\t"\
		"vmovaps	%%ymm0,    (%%rcx)							\n\t	vmovaps	%%ymm8 ,    (%%r11)	\n\t"\
		"vmovaps	%%ymm1,0x20(%%rdx)							\n\t	vmovaps	%%ymm10,0x20(%%r11)	\n\t"\
	"vfmadd132pd	(%%r15),%%ymm2,%%ymm6						\n\t	vmovaps	%%ymm9 ,0x20(%%r13)	\n\t"\
	"vfmadd132pd	(%%r15),%%ymm3,%%ymm7						\n\t	vmovaps	%%ymm11,    (%%r12)	\n\t"\
	"vfmadd132pd	(%%r15),%%ymm0,%%ymm5						\n\t	vfmadd132pd	-0x20(%%r14),%%ymm8 ,%%ymm12	\n\t"/* y = x + y.sqrt2 = x + y.isrt2 */\
	"vfmadd132pd	(%%r15),%%ymm1,%%ymm4						\n\t	vfmadd132pd	-0x20(%%r14),%%ymm10,%%ymm13	\n\t"\
		"vmovaps	%%ymm6,    (%%rax)							\n\t	vfmadd132pd	-0x20(%%r14),%%ymm11,%%ymm15	\n\t"\
		"vmovaps	%%ymm7,0x20(%%rax)							\n\t	vfmadd132pd	-0x20(%%r14),%%ymm9 ,%%ymm14	\n\t"\
		"vmovaps	%%ymm5,    (%%rdx)							\n\t	vmovaps	%%ymm12,    (%%r10)	\n\t"\
		"vmovaps	%%ymm4,0x20(%%rcx)							\n\t	vmovaps	%%ymm13,0x20(%%r10)	\n\t"\
		"																vmovaps	%%ymm15,    (%%r13)	\n\t"\
		"																vmovaps	%%ymm14,0x20(%%r12)	\n\t"\
	/* Block 2: */													/* Block 3: */\
		"addq	$%c[__i4],%%rsi	\n\t	leaq %c[__i4](%%rsi),%%r8 \n\t addq $%c[__i4],%%r8 \n\t"/* __in0+[4,c]*ostride */\
		"leaq	0x20(%%rsi),%%rdi								\n\t	leaq	0x20(%%r8 ),%%r9 	\n\t"\
		"vmovaps	%c[__i1](%%rsi),%%ymm4						\n\t	vmovaps	%c[__i1](%%r8 ),%%ymm12	\n\t"\
		"vmovaps	%c[__i3](%%rsi),%%ymm6						\n\t	vmovaps	%c[__i3](%%r8 ),%%ymm14	\n\t"\
		"vmovaps	%c[__i1](%%rdi),%%ymm5						\n\t	vmovaps	%c[__i1](%%r9 ),%%ymm13	\n\t"\
		"vmovaps	%c[__i3](%%rdi),%%ymm7						\n\t	vmovaps	%c[__i3](%%r9 ),%%ymm15	\n\t"\
		"movq	%[__isrt2],%%rdi	\n\t	addq	$0x20,%%rdi	\n\t"/* cc0, from isrt2 [rdi,rsi shared by both cols] */\
		"vmovaps	%%ymm4,%%ymm0								\n\t	vmovaps	%%ymm12,%%ymm8 		\n\t"\
	/*	"vmovaps	%%ymm6,%%ymm2								\n\t	vmovaps	%%ymm14,%%ymm10		\n\t"*/\
		"vmovaps	(%%rdi),%%ymm2								\n\t	vmovaps	0x20(%%rdi),%%ymm10	\n\t"/* Instead use these to store [c,s] */\
		"vmovaps	%%ymm5,%%ymm1								\n\t	vmovaps	%%ymm13,%%ymm9 		\n\t"\
		"vmovaps	%%ymm7,%%ymm3								\n\t	vmovaps	%%ymm15,%%ymm11		\n\t"\
		"vmulpd		    %%ymm2 ,%%ymm4,%%ymm4					\n\t	vmulpd		    %%ymm10,%%ymm12,%%ymm12	\n\t"\
		"vmulpd		    %%ymm2 ,%%ymm5,%%ymm5					\n\t	vmulpd		    %%ymm10,%%ymm13,%%ymm13	\n\t"\
		"vmulpd		    %%ymm10,%%ymm6,%%ymm6					\n\t	vmulpd		    %%ymm2 ,%%ymm14,%%ymm14	\n\t"\
		"vmulpd		    %%ymm10,%%ymm7,%%ymm7					\n\t	vmulpd		    %%ymm2 ,%%ymm15,%%ymm15	\n\t"\
	"vfnmadd231pd	    %%ymm10,%%ymm1,%%ymm4				\n\t	vfnmadd231pd	    %%ymm2 ,%%ymm9 ,%%ymm12		\n\t"\
	" vfmadd231pd	    %%ymm10,%%ymm0,%%ymm5				\n\t	 vfmadd231pd	    %%ymm2 ,%%ymm8 ,%%ymm13		\n\t"\
	"vfnmadd231pd	    %%ymm2 ,%%ymm3,%%ymm6				\n\t	vfnmadd231pd	    %%ymm10,%%ymm11,%%ymm14		\n\t"\
	" vfmadd231pd %c[__i3](%%rsi),%%ymm2,%%ymm7				\n\t	 vfmadd231pd %c[__i3](%%r8 ),%%ymm10,%%ymm15	\n\t"\
		"vsubpd	%%ymm6,%%ymm4,%%ymm4							\n\t	vsubpd	%%ymm14,%%ymm12,%%ymm12		\n\t"\
		"vsubpd	%%ymm7,%%ymm5,%%ymm5							\n\t	vsubpd	%%ymm15,%%ymm13,%%ymm13		\n\t"\
	"vfmadd132pd	(%%r15),%%ymm4,%%ymm6						\n\t	vfmadd132pd	(%%r15),%%ymm12,%%ymm14		\n\t"\
	"vfmadd132pd	(%%r15),%%ymm5,%%ymm7						\n\t	vfmadd132pd	(%%r15),%%ymm13,%%ymm15		\n\t"\
		"leaq	0x20(%%rsi),%%rdi								\n\t	leaq	0x20(%%r8 ),%%r9 	\n\t"\
		"vmovaps	%c[__i2](%%rsi),%%ymm2						\n\t	vmovaps	%c[__i2](%%r8 ),%%ymm10	\n\t"\
		"vmovaps	%c[__i2](%%rdi),%%ymm3						\n\t	vmovaps	%c[__i2](%%r9 ),%%ymm11	\n\t"\
		"vmovaps	        (%%rsi),%%ymm0						\n\t	vmovaps	        (%%r8 ),%%ymm8 	\n\t"\
		"vmovaps	    0x20(%%rsi),%%ymm1						\n\t	vmovaps	    0x20(%%r8 ),%%ymm9 	\n\t"\
		"vsubpd		  %%ymm3,%%ymm2,%%ymm2						\n\t	vaddpd	%%ymm11,%%ymm10,%%ymm10	\n\t"\
		"vaddpd	%c[__i2](%%rsi),%%ymm3,%%ymm3					\n\t	vsubpd	%c[__i2](%%r8 ),%%ymm11,%%ymm11	\n\t"\
		"movq	%[__isrt2],%%r9 	\n\t"\
	"vfnmadd231pd		 (%%r9),%%ymm2,%%ymm0				\n\t	vfnmadd231pd		 (%%r9),%%ymm10,%%ymm8 	\n\t"/* x = x - y.isrt2 */\
	"vfnmadd231pd		 (%%r9),%%ymm3,%%ymm1				\n\t	vfnmadd231pd		 (%%r9),%%ymm11,%%ymm9 	\n\t"\
	" vfmadd132pd	-0x20(%%r9),%%ymm0,%%ymm2				\n\t	 vfmadd132pd	-0x20(%%r9),%%ymm8 ,%%ymm10	\n\t"/* y = x + y.sqrt2 = x + y.isrt2 */\
	" vfmadd132pd	-0x20(%%r9),%%ymm1,%%ymm3				\n\t	 vfmadd132pd	-0x20(%%r9),%%ymm9 ,%%ymm11	\n\t"\
	"movq	%[out0],%%r8	\n\t	movq	%[off],%%r9	\n\t"/* Load output base-address into r8 and offset-array pointer into r9 */\
		"movslq		0x20(%%r9),%%rax	\n\t"/*        off8 */"movslq	0x30(%%r9),%%r10	\n\t"/*        offc */\
		"movslq		0x24(%%r9),%%rbx	\n\t"/*        off9 */"movslq	0x34(%%r9),%%r11	\n\t"/*        offd */\
		"movslq		0x28(%%r9),%%rcx	\n\t"/*        offa */"movslq	0x38(%%r9),%%r12	\n\t"/*        offe */\
		"movslq		0x2c(%%r9),%%rdx	\n\t"/*        offb */"movslq	0x3c(%%r9),%%r13	\n\t"/*        offf */\
		"leaq	(%%r8,%%rax,8),%%rax	\n\t"/* out0 + off8 */"leaq	(%%r8,%%r10,8),%%r10	\n\t"/* out0 + offc */\
		"leaq	(%%r8,%%rbx,8),%%rbx	\n\t"/* out0 + off9 */"leaq	(%%r8,%%r11,8),%%r11	\n\t"/* out0 + offd */\
		"leaq	(%%r8,%%rcx,8),%%rcx	\n\t"/* out0 + offa */"leaq	(%%r8,%%r12,8),%%r12	\n\t"/* out0 + offe */\
		"leaq	(%%r8,%%rdx,8),%%rdx	\n\t"/* out0 + offb */"leaq	(%%r8,%%r13,8),%%r13	\n\t"/* out0 + offf */\
		"vsubpd	%%ymm6,%%ymm2,%%ymm2							\n\t	vsubpd	%%ymm12,%%ymm8 ,%%ymm8 		\n\t"\
		"vsubpd	%%ymm7,%%ymm3,%%ymm3							\n\t	vsubpd	%%ymm13,%%ymm9 ,%%ymm9 		\n\t"\
		"vsubpd	%%ymm5,%%ymm0,%%ymm0							\n\t	vsubpd	%%ymm15,%%ymm10,%%ymm10	\n\t"\
		"vsubpd	%%ymm4,%%ymm1,%%ymm1							\n\t	vsubpd	%%ymm14,%%ymm11,%%ymm11	\n\t"\
	"vmovaps	%%ymm14,(%%rax) 	\n\t"/* spill ymm14 to make room for 2.0 */"	vmovaps	(%%r15),%%ymm14	\n\t"/* two */\
		"vmovaps	%%ymm2,    (%%rbx)							\n\t	vmovaps	%%ymm8 ,    (%%r11)	\n\t"\
		"vmovaps	%%ymm3,0x20(%%rbx)							\n\t	vmovaps	%%ymm9 ,0x20(%%r11)	\n\t"\
		"vmovaps	%%ymm0,    (%%rcx)							\n\t	vmovaps	%%ymm10,    (%%r12)	\n\t"\
		"vmovaps	%%ymm1,0x20(%%rdx)							\n\t	vmovaps	%%ymm11,0x20(%%r13)	\n\t"\
	"vfmadd132pd	%%ymm14,%%ymm2,%%ymm6						\n\t	vfmadd132pd	%%ymm14,%%ymm8 ,%%ymm12		\n\t"\
	"vfmadd132pd	%%ymm14,%%ymm3,%%ymm7						\n\t	vfmadd132pd	%%ymm14,%%ymm9 ,%%ymm13		\n\t"\
	"vfmadd132pd	%%ymm14,%%ymm0,%%ymm5						\n\t	vfmadd132pd	%%ymm14,%%ymm10,%%ymm15		\n\t"\
	"vfmadd132pd	%%ymm14,%%ymm1,%%ymm4						\n\t	vfmadd132pd	(%%rax),%%ymm11,%%ymm14		\n\t"\
		"vmovaps	%%ymm6,    (%%rax)							\n\t	vmovaps	%%ymm12,    (%%r10)	\n\t"\
		"vmovaps	%%ymm7,0x20(%%rax)							\n\t	vmovaps	%%ymm13,0x20(%%r10)	\n\t"\
		"vmovaps	%%ymm5,    (%%rdx)							\n\t	vmovaps	%%ymm15,    (%%r13)	\n\t"\
		"vmovaps	%%ymm4,0x20(%%rcx)							\n\t	vmovaps	%%ymm14,0x20(%%r12)	\n\t"\
		:					/* outputs: none */\
		:[__in0] "m" (Xin0)	/* All inputs from memory addresses here */\
		,[__i1] "e" (Xi1)\
		,[__i2] "e" (Xi2)\
		,[__i3] "e" (Xi3)\
		,[__i4] "e" (Xi4)\
		,[__isrt2] "m" (Xisrt2)\
		,[__two] "m" (Xtwo)\
		,[out0] "m" (Xout0) /* output-address-octet base pointer */\
		,[off] "m" (Xoff)	/* and pointer to uint32 array of 8 double* index offsets */\
		: "cc","memory","rax","rbx","rcx","rdx","rsi","rdi","r8","r9","r10","r11","r12","r13","r14","r15","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15"	/* Clobbered registers */\
	);\
	}

	// Same as above, but with specifiable I-addresses and regularly spaced O-addresses:
	//
	#define SSE2_RADIX16_DIF_0TWIDDLE_B(Xin0,Xi1,Xi2,Xi3,Xi4, Xisrt2,Xtwo, Xout0)\
	{\
	__asm__ volatile (\
		"movq	%[__two],%%r15	\n\t"/* two, used for FMA-based double-and-ADD/SUBs */\
	/* Block 0: SSE2_RADIX4_DIF_IN_PLACE(r1 , r17, r9 , r25): */	/* Block 2: SSE2_RADIX4_DIF_IN_PLACE(r5 , r21, r13, r29): */\
	"movq	%[__in0],%%rax	\n\t"/* Note BR of r[abcd]x: b<-->c */	"	leaq	%c[__i2](%%rax),%%r10	\n\t"/* addr += 2*ostride */\
	"leaq	%c[__i4](%%rax),%%rcx	\n\t"/* __in0+  [4*istride] */	"	leaq	%c[__i2](%%rcx),%%r12	\n\t"/* w.r.to to Block 0 */\
	"leaq	%c[__i4](%%rcx),%%rbx	\n\t"/* __in0+2*[4*istride] */	"	leaq	%c[__i2](%%rbx),%%r11	\n\t"/* Note BR of r1[0123]: r11<-->r12 */\
	"leaq	%c[__i4](%%rbx),%%rdx	\n\t"/* __in0+3*[4*istride] */	"	leaq	%c[__i2](%%rdx),%%r13	\n\t"\
		"vmovaps	    (%%rbx),%%ymm0							\n\t	vmovaps	    (%%r11),%%ymm8 	\n\t"\
		"vmovaps	0x20(%%rbx),%%ymm1							\n\t	vmovaps	0x20(%%r11),%%ymm9 	\n\t"\
		"vmovaps	    (%%rax),%%ymm2							\n\t	vmovaps	    (%%r10),%%ymm10	\n\t"\
		"vmovaps	0x20(%%rax),%%ymm3							\n\t	vmovaps	0x20(%%r10),%%ymm11	\n\t"\
		"vmovaps	    (%%rdx),%%ymm4							\n\t	vmovaps	    (%%r13),%%ymm12	\n\t"\
		"vmovaps	0x20(%%rdx),%%ymm5							\n\t	vmovaps	0x20(%%r13),%%ymm13	\n\t"\
		"vmovaps	    (%%rcx),%%ymm6							\n\t	vmovaps	    (%%r12),%%ymm14	\n\t"\
		"vmovaps	0x20(%%rcx),%%ymm7							\n\t	vmovaps	0x20(%%r12),%%ymm15	\n\t"\
		"vsubpd		%%ymm0 ,%%ymm2,%%ymm2						\n\t	vsubpd		%%ymm8 ,%%ymm10,%%ymm10	\n\t"\
		"vsubpd		%%ymm1 ,%%ymm3,%%ymm3						\n\t	vsubpd		%%ymm9 ,%%ymm11,%%ymm11	\n\t"\
		"vsubpd		%%ymm4 ,%%ymm6,%%ymm6						\n\t	vsubpd		%%ymm12,%%ymm14,%%ymm14	\n\t"\
		"vsubpd		%%ymm5 ,%%ymm7,%%ymm7						\n\t	vsubpd		%%ymm13,%%ymm15,%%ymm15	\n\t"\
	"vmovaps	%%ymm13,(%%rax) 	\n\t"/* spill ymm13 to make room for 2.0 */"	vmovaps	(%%r15),%%ymm13	\n\t"/* two */\
	"vfmadd132pd	%%ymm13,%%ymm2,%%ymm0						\n\t	vfmadd132pd	%%ymm13,%%ymm10,%%ymm8 	\n\t"\
	"vfmadd132pd	%%ymm13,%%ymm3,%%ymm1						\n\t	vfmadd132pd	%%ymm13,%%ymm11,%%ymm9 	\n\t"\
	"vfmadd132pd	%%ymm13,%%ymm6,%%ymm4						\n\t	vfmadd132pd	%%ymm13,%%ymm14,%%ymm12	\n\t"\
	"vfmadd132pd	%%ymm13,%%ymm7,%%ymm5						\n\t	vfmadd132pd	(%%rax),%%ymm15,%%ymm13	\n\t"\
		"vsubpd	%%ymm4,%%ymm0,%%ymm0							\n\t	vsubpd	%%ymm12,%%ymm8 ,%%ymm8 		\n\t"\
		"vsubpd	%%ymm5,%%ymm1,%%ymm1							\n\t	vsubpd	%%ymm13,%%ymm9 ,%%ymm9 		\n\t"\
		"vsubpd	%%ymm7,%%ymm2,%%ymm2							\n\t	vsubpd	%%ymm15,%%ymm10,%%ymm10		\n\t"\
		"vsubpd	%%ymm6,%%ymm3,%%ymm3							\n\t	vsubpd	%%ymm14,%%ymm11,%%ymm11		\n\t"\
		"vmovaps	%%ymm0,    (%%rbx)							\n\t	vmovaps	%%ymm8 ,    (%%r11)	\n\t"\
		"vmovaps	%%ymm1,0x20(%%rbx)							\n\t	vmovaps	%%ymm9 ,0x20(%%r11)	\n\t"\
		"vmovaps	%%ymm2,    (%%rcx)							\n\t	vmovaps	%%ymm10,    (%%r12)	\n\t"\
		"vmovaps	%%ymm3,0x20(%%rdx)							\n\t	vmovaps	%%ymm11,0x20(%%r13)	\n\t"\
	"vmovaps	%%ymm14,(%%rax) 	\n\t"/* spill ymm14 to make room for 2.0 */"	vmovaps	(%%r15),%%ymm14	\n\t"/* two */\
	"vfmadd132pd	%%ymm14,%%ymm0,%%ymm4						\n\t	vfmadd132pd	%%ymm14,%%ymm8 ,%%ymm12		\n\t"\
	"vfmadd132pd	%%ymm14,%%ymm1,%%ymm5						\n\t	vfmadd132pd	%%ymm14,%%ymm9 ,%%ymm13		\n\t"\
	"vfmadd132pd	%%ymm14,%%ymm2,%%ymm7						\n\t	vfmadd132pd	%%ymm14,%%ymm10,%%ymm15		\n\t"\
	"vfmadd132pd	%%ymm14,%%ymm3,%%ymm6						\n\t	vfmadd132pd	(%%rax),%%ymm11,%%ymm14		\n\t"\
		"vmovaps	%%ymm4,    (%%rax)							\n\t	vmovaps	%%ymm12,    (%%r10)	\n\t"\
		"vmovaps	%%ymm5,0x20(%%rax)							\n\t	vmovaps	%%ymm13,0x20(%%r10)	\n\t"\
		"vmovaps	%%ymm7,    (%%rdx)							\n\t	vmovaps	%%ymm15,    (%%r13)	\n\t"\
		"vmovaps	%%ymm6,0x20(%%rcx)							\n\t	vmovaps	%%ymm14,0x20(%%r12)	\n\t"\
	/* Block 1: SSE2_RADIX4_DIF_IN_PLACE(r3 , r19, r11, r27): */	/* Block 3: SSE2_RADIX4_DIF_IN_PLACE(r7 , r23, r15, r31): */\
		"leaq	%c[__i1](%%rax),%%rax	\n\t"/* addr += 1*ostride */"	leaq	%c[__i2](%%rax),%%r10	\n\t"/* addr += 2*ostride */\
		"leaq	%c[__i1](%%rbx),%%rbx	\n\t"/* w.r.to to Block 0 */"	leaq	%c[__i2](%%rbx),%%r11	\n\t"/* w.r.to to Block 1 */\
		"leaq	%c[__i1](%%rcx),%%rcx							\n\t	leaq	%c[__i2](%%rcx),%%r12	\n\t"\
		"leaq	%c[__i1](%%rdx),%%rdx							\n\t	leaq	%c[__i2](%%rdx),%%r13	\n\t"\
		"vmovaps	    (%%rbx),%%ymm0							\n\t	vmovaps	    (%%r11),%%ymm8 	\n\t"\
		"vmovaps	0x20(%%rbx),%%ymm1							\n\t	vmovaps	0x20(%%r11),%%ymm9 	\n\t"\
		"vmovaps	    (%%rax),%%ymm2							\n\t	vmovaps	    (%%r10),%%ymm10	\n\t"\
		"vmovaps	0x20(%%rax),%%ymm3							\n\t	vmovaps	0x20(%%r10),%%ymm11	\n\t"\
		"vmovaps	    (%%rdx),%%ymm4							\n\t	vmovaps	    (%%r13),%%ymm12	\n\t"\
		"vmovaps	0x20(%%rdx),%%ymm5							\n\t	vmovaps	0x20(%%r13),%%ymm13	\n\t"\
		"vmovaps	    (%%rcx),%%ymm6							\n\t	vmovaps	    (%%r12),%%ymm14	\n\t"\
		"vmovaps	0x20(%%rcx),%%ymm7							\n\t	vmovaps	0x20(%%r12),%%ymm15	\n\t"\
		"vsubpd		%%ymm0 ,%%ymm2,%%ymm2						\n\t	vsubpd		%%ymm8 ,%%ymm10,%%ymm10	\n\t"\
		"vsubpd		%%ymm1 ,%%ymm3,%%ymm3						\n\t	vsubpd		%%ymm9 ,%%ymm11,%%ymm11	\n\t"\
		"vsubpd		%%ymm4 ,%%ymm6,%%ymm6						\n\t	vsubpd		%%ymm12,%%ymm14,%%ymm14	\n\t"\
		"vsubpd		%%ymm5 ,%%ymm7,%%ymm7						\n\t	vsubpd		%%ymm13,%%ymm15,%%ymm15	\n\t"\
	"vmovaps	%%ymm13,(%%rax) 	\n\t"/* spill ymm13 to make room for 2.0 */"	vmovaps	(%%r15),%%ymm13	\n\t"/* two */\
	"vfmadd132pd	%%ymm13,%%ymm2,%%ymm0						\n\t	vfmadd132pd	%%ymm13,%%ymm10,%%ymm8 	\n\t"\
	"vfmadd132pd	%%ymm13,%%ymm3,%%ymm1						\n\t	vfmadd132pd	%%ymm13,%%ymm11,%%ymm9 	\n\t"\
	"vfmadd132pd	%%ymm13,%%ymm6,%%ymm4						\n\t	vfmadd132pd	%%ymm13,%%ymm14,%%ymm12	\n\t"\
	"vfmadd132pd	%%ymm13,%%ymm7,%%ymm5						\n\t	vfmadd132pd	(%%rax),%%ymm15,%%ymm13	\n\t"\
		"vsubpd	%%ymm4,%%ymm0,%%ymm0							\n\t	vsubpd	%%ymm12,%%ymm8 ,%%ymm8 		\n\t"\
		"vsubpd	%%ymm5,%%ymm1,%%ymm1							\n\t	vsubpd	%%ymm13,%%ymm9 ,%%ymm9 		\n\t"\
		"vsubpd	%%ymm7,%%ymm2,%%ymm2							\n\t	vsubpd	%%ymm15,%%ymm10,%%ymm10		\n\t"\
		"vsubpd	%%ymm6,%%ymm3,%%ymm3							\n\t	vsubpd	%%ymm14,%%ymm11,%%ymm11		\n\t"\
		"vmovaps	%%ymm0,    (%%rbx)							\n\t	vmovaps	%%ymm8 ,    (%%r11)	\n\t"\
		"vmovaps	%%ymm1,0x20(%%rbx)							\n\t	vmovaps	%%ymm9 ,0x20(%%r11)	\n\t"\
		"vmovaps	%%ymm2,    (%%rcx)							\n\t	vmovaps	%%ymm10,    (%%r12)	\n\t"\
		"vmovaps	%%ymm3,0x20(%%rdx)							\n\t	vmovaps	%%ymm11,0x20(%%r13)	\n\t"\
	"vmovaps	%%ymm14,(%%rax) 	\n\t"/* spill ymm14 to make room for 2.0 */"	vmovaps	(%%r15),%%ymm14	\n\t"/* two */\
	"vfmadd132pd	%%ymm14,%%ymm0,%%ymm4						\n\t	vfmadd132pd	%%ymm14,%%ymm8 ,%%ymm12		\n\t"\
	"vfmadd132pd	%%ymm14,%%ymm1,%%ymm5						\n\t	vfmadd132pd	%%ymm14,%%ymm9 ,%%ymm13		\n\t"\
	"vfmadd132pd	%%ymm14,%%ymm2,%%ymm7						\n\t	vfmadd132pd	%%ymm14,%%ymm10,%%ymm15		\n\t"\
	"vfmadd132pd	%%ymm14,%%ymm3,%%ymm6						\n\t	vfmadd132pd	(%%rax),%%ymm11,%%ymm14		\n\t"\
		"vmovaps	%%ymm4,    (%%rax)							\n\t	vmovaps	%%ymm12,    (%%r10)	\n\t"\
		"vmovaps	%%ymm5,0x20(%%rax)							\n\t	vmovaps	%%ymm13,0x20(%%r10)	\n\t"\
		"vmovaps	%%ymm7,    (%%rdx)							\n\t	vmovaps	%%ymm15,    (%%r13)	\n\t"\
		"vmovaps	%%ymm6,0x20(%%rcx)							\n\t	vmovaps	%%ymm14,0x20(%%r12)	\n\t"\
	/******************************************************************************/\
	/*** Now do 4 DFTs with internal twiddles on the 4*stride - separated data. ***/\
	/*** Order 0,2,1,3 allows incr-only of rsi-datum from 1 block to the next: ****/\
	/******************************************************************************/\
	/* Block 0: r0-3 */												/* Block 1: r8-b */\
		"movq	%[__in0],%%rsi	\n\t	leaq %c[__i4](%%rsi),%%r8 \n\t leaq %c[__i4](%%r8 ),%%r8 \n\t"/* __in0+8*ostride */\
		"movq	%[__out0],%%rax									\n\t	leaq	0x100(%%rax),%%r10		\n\t"/* out4 */\
		"leaq	0x040(%%rax),%%rbx		/* out1 */				\n\t	leaq	0x040(%%r10),%%r11		\n\t"/* out5 */\
		"leaq	0x080(%%rax),%%rcx		/* out2 */				\n\t	leaq	0x080(%%r10),%%r12		\n\t"/* out6 */\
		"leaq	0x0c0(%%rax),%%rdx		/* out3 */				\n\t	leaq	0x0c0(%%r10),%%r13		\n\t"/* out7 */\
	/* Need separate address for Im parts of outputs due to literal-offsets below: */\
		"leaq	0x20(%%rsi),%%rdi								\n\t	leaq	0x20(%%r8 ),%%r9 	\n\t"\
		"vmovaps	        (%%rsi),%%ymm0						\n\t	vmovaps	        (%%r8 ),%%ymm8 	\n\t"/* ar */\
		"vmovaps	        (%%rdi),%%ymm1						\n\t	vmovaps	        (%%r9 ),%%ymm9 	\n\t"/* ai */\
		"vmovaps	%c[__i2](%%rsi),%%ymm2						\n\t	vmovaps	%c[__i2](%%r8 ),%%ymm10	\n\t"/* br */\
		"vmovaps	%c[__i2](%%rdi),%%ymm3						\n\t	vmovaps	%c[__i2](%%r9 ),%%ymm11	\n\t"/* bi */\
		"vmovaps	%c[__i1](%%rsi),%%ymm4						\n\t	vmovaps	%c[__i1](%%r8 ),%%ymm12	\n\t"/* cr */\
		"vmovaps	%c[__i1](%%rdi),%%ymm5						\n\t	vmovaps	%c[__i1](%%r9 ),%%ymm13	\n\t"/* ci */\
		"vmovaps	%c[__i3](%%rsi),%%ymm6						\n\t	vmovaps	%c[__i3](%%r8 ),%%ymm14	\n\t"/* dr */\
		"vmovaps	%c[__i3](%%rdi),%%ymm7						\n\t	vmovaps	%c[__i3](%%r9 ),%%ymm15	\n\t"/* di */\
		"																movq	%[__isrt2],%%r9 	\n\t"\
		"vsubpd		%%ymm2 ,%%ymm0,%%ymm0						\n\t	vsubpd		%%ymm11,%%ymm8 ,%%ymm8 	\n\t"/* ar-bi */\
		"vsubpd		%%ymm3 ,%%ymm1,%%ymm1						\n\t	vsubpd		%%ymm10,%%ymm9 ,%%ymm9 	\n\t"/* ai-br */\
		"vsubpd		%%ymm6 ,%%ymm4,%%ymm4						\n\t	vsubpd		%%ymm13,%%ymm12,%%ymm12	\n\t"/* cr-ci */\
		"vsubpd		%%ymm7 ,%%ymm5,%%ymm5						\n\t	vsubpd		%%ymm14,%%ymm15,%%ymm15	\n\t"/* di-dr */\
	"vmovaps	%%ymm14,(%%rax) 	\n\t"/* spill ymm14 to make room for 2.0 */"	vmovaps	(%%r15),%%ymm14	\n\t"/* two */\
	"vfmadd132pd	%%ymm14,%%ymm0,%%ymm2						\n\t	vfmadd132pd	%%ymm14,%%ymm8 ,%%ymm11	\n\t"/* ar+bi */\
	"vfmadd132pd	%%ymm14,%%ymm1,%%ymm3						\n\t	vfmadd132pd	%%ymm14,%%ymm9 ,%%ymm10	\n\t"/* ai+br */\
	"vfmadd132pd	%%ymm14,%%ymm4,%%ymm6						\n\t	vfmadd132pd	%%ymm14,%%ymm12,%%ymm13	\n\t"/* cr+ci */\
	"vfmadd132pd	%%ymm14,%%ymm5,%%ymm7						\n\t	vfmadd132pd	(%%rax),%%ymm15,%%ymm14	\n\t"/* di+dr */\
		"																	vsubpd	%%ymm14,%%ymm12,%%ymm12		\n\t"\
		"																	vsubpd	%%ymm15,%%ymm13,%%ymm13		\n\t"\
		"vsubpd		%%ymm6,%%ymm2,%%ymm2						\n\t	vfmadd132pd	(%%r15),%%ymm12,%%ymm14		\n\t"\
		"vsubpd		%%ymm7,%%ymm3,%%ymm3						\n\t	vfmadd132pd	(%%r15),%%ymm13,%%ymm15		\n\t"\
		"vsubpd		%%ymm5,%%ymm0,%%ymm0						\n\t	vfnmadd231pd	(%%r9 ),%%ymm12,%%ymm8 	\n\t"/* x = x - y.isrt2 */\
		"vsubpd		%%ymm4,%%ymm1,%%ymm1						\n\t	vfnmadd231pd	(%%r9 ),%%ymm13,%%ymm10	\n\t"\
		"vmovaps	%%ymm2,    (%%rbx)							\n\t	vfnmadd231pd	(%%r9 ),%%ymm14,%%ymm9 	\n\t"\
		"vmovaps	%%ymm3,0x20(%%rbx)							\n\t	vfnmadd231pd	(%%r9 ),%%ymm15,%%ymm11	\n\t"\
		"vmovaps	%%ymm0,    (%%rcx)							\n\t	vmovaps	%%ymm8 ,    (%%r11)	\n\t"\
		"vmovaps	%%ymm1,0x20(%%rdx)							\n\t	vmovaps	%%ymm10,0x20(%%r11)	\n\t"\
	"vfmadd132pd	(%%r15),%%ymm2,%%ymm6						\n\t	vmovaps	%%ymm9 ,0x20(%%r13)	\n\t"\
	"vfmadd132pd	(%%r15),%%ymm3,%%ymm7						\n\t	vmovaps	%%ymm11,    (%%r12)	\n\t"\
	"vfmadd132pd	(%%r15),%%ymm0,%%ymm5						\n\t	vfmadd132pd	-0x20(%%r9),%%ymm8 ,%%ymm12	\n\t"/* y = x + y.sqrt2 = x + y.isrt2 */\
	"vfmadd132pd	(%%r15),%%ymm1,%%ymm4						\n\t	vfmadd132pd	-0x20(%%r9),%%ymm10,%%ymm13	\n\t"\
		"vmovaps	%%ymm6,    (%%rax)							\n\t	vfmadd132pd	-0x20(%%r9),%%ymm11,%%ymm15	\n\t"\
		"vmovaps	%%ymm7,0x20(%%rax)							\n\t	vfmadd132pd	-0x20(%%r9),%%ymm9 ,%%ymm14	\n\t"\
		"vmovaps	%%ymm5,    (%%rdx)							\n\t	vmovaps	%%ymm12,    (%%r10)	\n\t"\
		"vmovaps	%%ymm4,0x20(%%rcx)							\n\t	vmovaps	%%ymm13,0x20(%%r10)	\n\t"\
		"																vmovaps	%%ymm15,    (%%r13)	\n\t"\
		"																vmovaps	%%ymm14,0x20(%%r12)	\n\t"\
	/* Block 2: */													/* Block 3: */\
		"leaq	%c[__i4](%%rsi),%%rsi	\n\t"/* __in0+4*ostride */	"	leaq	%c[__i4](%%r8 ),%%r8 	\n\t"/* __in0+c*ostride */\
		"leaq	0x20(%%rsi),%%rdi								\n\t	leaq	0x20(%%r8 ),%%r9 	\n\t"\
		"addq	$0x200,%%rax		/* out8 */					\n\t	addq	$0x200,%%r10		\n\t"/* outc */\
		"addq	$0x200,%%rbx		/* out9 */					\n\t	addq	$0x200,%%r11		\n\t"/* outd */\
		"addq	$0x200,%%rcx		/* outa */					\n\t	addq	$0x200,%%r12		\n\t"/* oute */\
		"addq	$0x200,%%rdx		/* outb */					\n\t	addq	$0x200,%%r13		\n\t"/* outf */\
		"vmovaps	%c[__i1](%%rsi),%%ymm4						\n\t	vmovaps	%c[__i1](%%r8 ),%%ymm12	\n\t"\
		"vmovaps	%c[__i3](%%rsi),%%ymm6						\n\t	vmovaps	%c[__i3](%%r8 ),%%ymm14	\n\t"\
		"vmovaps	%c[__i1](%%rdi),%%ymm5						\n\t	vmovaps	%c[__i1](%%r9 ),%%ymm13	\n\t"\
		"vmovaps	%c[__i3](%%rdi),%%ymm7						\n\t	vmovaps	%c[__i3](%%r9 ),%%ymm15	\n\t"\
		"movq	%[__isrt2],%%rdi	\n\t	addq	$0x20,%%rdi	\n\t"/* cc0, from isrt2 [rdi,rsi shared by both cols] */\
		"vmovaps	%%ymm4,%%ymm0								\n\t	vmovaps	%%ymm12,%%ymm8 		\n\t"\
	/*	"vmovaps	%%ymm6,%%ymm2								\n\t	vmovaps	%%ymm14,%%ymm10		\n\t"*/\
		"vmovaps	(%%rdi),%%ymm2								\n\t	vmovaps	0x20(%%rdi),%%ymm10	\n\t"/* Instead use these to store [c,s] */\
		"vmovaps	%%ymm5,%%ymm1								\n\t	vmovaps	%%ymm13,%%ymm9 		\n\t"\
		"vmovaps	%%ymm7,%%ymm3								\n\t	vmovaps	%%ymm15,%%ymm11		\n\t"\
		"vmulpd		    %%ymm2 ,%%ymm4,%%ymm4					\n\t	vmulpd		    %%ymm10,%%ymm12,%%ymm12	\n\t"\
		"vmulpd		    %%ymm2 ,%%ymm5,%%ymm5					\n\t	vmulpd		    %%ymm10,%%ymm13,%%ymm13	\n\t"\
		"vmulpd		    %%ymm10,%%ymm6,%%ymm6					\n\t	vmulpd		    %%ymm2 ,%%ymm14,%%ymm14	\n\t"\
		"vmulpd		    %%ymm10,%%ymm7,%%ymm7					\n\t	vmulpd		    %%ymm2 ,%%ymm15,%%ymm15	\n\t"\
	"vfnmadd231pd	    %%ymm10,%%ymm1,%%ymm4				\n\t	vfnmadd231pd	    %%ymm2 ,%%ymm9 ,%%ymm12		\n\t"\
	" vfmadd231pd	    %%ymm10,%%ymm0,%%ymm5				\n\t	 vfmadd231pd	    %%ymm2 ,%%ymm8 ,%%ymm13		\n\t"\
	"vfnmadd231pd	    %%ymm2 ,%%ymm3,%%ymm6				\n\t	vfnmadd231pd	    %%ymm10,%%ymm11,%%ymm14		\n\t"\
	" vfmadd231pd %c[__i3](%%rsi),%%ymm2,%%ymm7				\n\t	 vfmadd231pd %c[__i3](%%r8 ),%%ymm10,%%ymm15	\n\t"\
		"vsubpd	%%ymm6,%%ymm4,%%ymm4							\n\t	vsubpd	%%ymm14,%%ymm12,%%ymm12		\n\t"\
		"vsubpd	%%ymm7,%%ymm5,%%ymm5							\n\t	vsubpd	%%ymm15,%%ymm13,%%ymm13		\n\t"\
	"vfmadd132pd	(%%r15),%%ymm4,%%ymm6						\n\t	vfmadd132pd	(%%r15),%%ymm12,%%ymm14		\n\t"\
	"vfmadd132pd	(%%r15),%%ymm5,%%ymm7						\n\t	vfmadd132pd	(%%r15),%%ymm13,%%ymm15		\n\t"\
		"leaq	0x20(%%rsi),%%rdi								\n\t	leaq	0x20(%%r8 ),%%r9 	\n\t"\
		"vmovaps	%c[__i2](%%rsi),%%ymm2						\n\t	vmovaps	%c[__i2](%%r8 ),%%ymm10	\n\t"\
		"vmovaps	%c[__i2](%%rdi),%%ymm3						\n\t	vmovaps	%c[__i2](%%r9 ),%%ymm11	\n\t"\
		"vmovaps	        (%%rsi),%%ymm0						\n\t	vmovaps	        (%%r8 ),%%ymm8 	\n\t"\
		"vmovaps	    0x20(%%rsi),%%ymm1						\n\t	vmovaps	    0x20(%%r8 ),%%ymm9 	\n\t"\
		"vsubpd		  %%ymm3,%%ymm2,%%ymm2						\n\t	vaddpd	%%ymm11,%%ymm10,%%ymm10	\n\t"\
		"vaddpd	%c[__i2](%%rsi),%%ymm3,%%ymm3					\n\t	vsubpd	%c[__i2](%%r8 ),%%ymm11,%%ymm11	\n\t"\
		"movq	%[__isrt2],%%r9 	\n\t"\
	"vfnmadd231pd		 (%%r9),%%ymm2,%%ymm0				\n\t	vfnmadd231pd		 (%%r9),%%ymm10,%%ymm8 	\n\t"/* x = x - y.isrt2 */\
	"vfnmadd231pd		 (%%r9),%%ymm3,%%ymm1				\n\t	vfnmadd231pd		 (%%r9),%%ymm11,%%ymm9 	\n\t"\
	" vfmadd132pd	-0x20(%%r9),%%ymm0,%%ymm2				\n\t	 vfmadd132pd	-0x20(%%r9),%%ymm8 ,%%ymm10	\n\t"/* y = x + y.sqrt2 = x + y.isrt2 */\
	" vfmadd132pd	-0x20(%%r9),%%ymm1,%%ymm3				\n\t	 vfmadd132pd	-0x20(%%r9),%%ymm9 ,%%ymm11	\n\t"\
		"vsubpd	%%ymm6,%%ymm2,%%ymm2							\n\t	vsubpd	%%ymm12,%%ymm8 ,%%ymm8 		\n\t"\
		"vsubpd	%%ymm7,%%ymm3,%%ymm3							\n\t	vsubpd	%%ymm13,%%ymm9 ,%%ymm9 		\n\t"\
		"vsubpd	%%ymm5,%%ymm0,%%ymm0							\n\t	vsubpd	%%ymm15,%%ymm10,%%ymm10	\n\t"\
		"vsubpd	%%ymm4,%%ymm1,%%ymm1							\n\t	vsubpd	%%ymm14,%%ymm11,%%ymm11	\n\t"\
	"vmovaps	%%ymm14,(%%rax) 	\n\t"/* spill ymm14 to make room for 2.0 */"	vmovaps	(%%r15),%%ymm14	\n\t"/* two */\
		"vmovaps	%%ymm2,    (%%rbx)							\n\t	vmovaps	%%ymm8 ,    (%%r11)	\n\t"\
		"vmovaps	%%ymm3,0x20(%%rbx)							\n\t	vmovaps	%%ymm9 ,0x20(%%r11)	\n\t"\
		"vmovaps	%%ymm0,    (%%rcx)							\n\t	vmovaps	%%ymm10,    (%%r12)	\n\t"\
		"vmovaps	%%ymm1,0x20(%%rdx)							\n\t	vmovaps	%%ymm11,0x20(%%r13)	\n\t"\
	"vfmadd132pd	%%ymm14,%%ymm2,%%ymm6						\n\t	vfmadd132pd	%%ymm14,%%ymm8 ,%%ymm12		\n\t"\
	"vfmadd132pd	%%ymm14,%%ymm3,%%ymm7						\n\t	vfmadd132pd	%%ymm14,%%ymm9 ,%%ymm13		\n\t"\
	"vfmadd132pd	%%ymm14,%%ymm0,%%ymm5						\n\t	vfmadd132pd	%%ymm14,%%ymm10,%%ymm15		\n\t"\
	"vfmadd132pd	%%ymm14,%%ymm1,%%ymm4						\n\t	vfmadd132pd	(%%rax),%%ymm11,%%ymm14		\n\t"\
		"vmovaps	%%ymm6,    (%%rax)							\n\t	vmovaps	%%ymm12,    (%%r10)	\n\t"\
		"vmovaps	%%ymm7,0x20(%%rax)							\n\t	vmovaps	%%ymm13,0x20(%%r10)	\n\t"\
		"vmovaps	%%ymm5,    (%%rdx)							\n\t	vmovaps	%%ymm15,    (%%r13)	\n\t"\
		"vmovaps	%%ymm4,0x20(%%rcx)							\n\t	vmovaps	%%ymm14,0x20(%%r12)	\n\t"\
		:					/* outputs: none */\
		:[__in0] "m" (Xin0)	/* All inputs from memory addresses here */\
		,[__i1] "e" (Xi1)\
		,[__i2] "e" (Xi2)\
		,[__i3] "e" (Xi3)\
		,[__i4] "e" (Xi4)\
		,[__isrt2] "m" (Xisrt2)\
		,[__two] "m" (Xtwo)\
		,[__out0] "m" (Xout0)\
		: "cc","memory","rax","rbx","rcx","rdx","rsi","rdi","r8","r9","r10","r11","r12","r13","r15","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15"	/* Clobbered registers */\
	);\
	}

	// Based on the SSE2_RADIX16_DIT_NOTWIDDLE macro in radix16_ditN_cy_dif1_gcc64.h, but with completely
	// specifiable 16-input addressing required for usage as the power-of-2 component of a twiddleless
	// radix = [odd*2^n] DFT routine.
	// We use just a single output base-pointer plus literal ostrides which are [1,2,3,4]-multiples of
	// __o1; this allows us to cut GP-register usage, which is absolutely a must for the 32-bit version
	// of the macro, and is a benefit to the 64-bit versions which code-fold to yield 2 side-by-side
	// streams of independently executable instructions, one for data in xmm0-7, the other using xmm8-15.
	//
	#define SSE2_RADIX16_DIT_0TWIDDLE(Xin0,Xoff, Xisrt2,Xtwo, Xout0,Xo1,Xo2,Xo3,Xo4)\
	{\
	__asm__ volatile (\
		/* Jan 2021: In reduced-#args revamp of this macro, Due to GPR-usage constraints, instead of
		keeping a ptr to vec_dbl*two in GPR r15 throughout, create 2.0 in low double, of an XMM, copy
		to an MMX via MOVDQ2Q, and copy-back-via-MOVQ2DQ/shuffle-propagate-to-rest-of-XMM as needed: */\
		"vpcmpeqw	%%xmm0,%%xmm0,%%xmm0\n\t"\
		"vpsllq		$63,%%xmm0,%%xmm0	\n\t"\
		"vpsrlq		$01,%%xmm0,%%xmm0	\n\t"\
		"movdq2q	%%xmm0,%%mm0		\n\t"\
	"movq	%[in0],%%r14	\n\t	movq	%[off],%%r15	\n\t"/* Load input base-address into r14 and int32[16] offset-array pointer into r15 */\
		"movslq		    (%%r15),%%rax	\n\t	movslq		0x20(%%r15),%%r10	\n\t"/* off[0-3],[8-b] */\
		"movslq		0x04(%%r15),%%rbx	\n\t	movslq		0x24(%%r15),%%r11	\n\t"\
		"movslq		0x08(%%r15),%%rcx	\n\t	movslq		0x28(%%r15),%%r12	\n\t"\
		"movslq		0x0c(%%r15),%%rdx	\n\t	movslq		0x2c(%%r15),%%r13	\n\t"\
		"leaq	(%%r14,%%rax,8),%%rax	\n\t	leaq	(%%r14,%%r10,8),%%r10	\n\t"/* in0 + off[0-3],[8-b] */\
		"leaq	(%%r14,%%rbx,8),%%rbx	\n\t	leaq	(%%r14,%%r11,8),%%r11	\n\t"\
		"leaq	(%%r14,%%rcx,8),%%rcx	\n\t	leaq	(%%r14,%%r12,8),%%r12	\n\t"\
		"leaq	(%%r14,%%rdx,8),%%rdx	\n\t	leaq	(%%r14,%%r13,8),%%r13	\n\t"\
	"prefetcht1	0x100(%%rax)	\n\t"\
		/* Need separate address Im parts of outputs due to literal-offsets below */\
		"movq	%[__out0],%%rsi									\n\t	leaq	%c[__o4](%%rsi),%%r8 	\n\t"\
		"leaq	0x20(%%rsi),%%rdi								\n\t	addq	$%c[__o4],%%r8	 	\n\t"/* out0+8*ostride */\
		"																leaq	0x20(%%r8 ),%%r9 	\n\t"\
		/* SSE2_RADIX4_DIT_0TWIDDLE_B(r0 ): */							/* SSE2_RADIX4_DIT_0TWIDDLE_B(r16): */\
		"vmovaps	    (%%rax),%%ymm2							\n\t	vmovaps	    (%%r10),%%ymm10	\n\t"\
		"vmovaps	    (%%rcx),%%ymm6							\n\t	vmovaps	    (%%r12),%%ymm14	\n\t"\
		"vmovaps	0x20(%%rax),%%ymm3							\n\t	vmovaps	0x20(%%r10),%%ymm11	\n\t"\
		"vmovaps	0x20(%%rcx),%%ymm7							\n\t	vmovaps	0x20(%%r12),%%ymm15	\n\t"\
		"vmovaps	    (%%rbx),%%ymm0							\n\t	vmovaps	    (%%r11),%%ymm8 	\n\t"\
		"vmovaps	    (%%rdx),%%ymm4							\n\t	vmovaps	    (%%r13),%%ymm12	\n\t"\
		"vmovaps	0x20(%%rbx),%%ymm1							\n\t	vmovaps	0x20(%%r11),%%ymm9 	\n\t"\
		"vmovaps	0x20(%%rdx),%%ymm5							\n\t	vmovaps	0x20(%%r13),%%ymm13	\n\t"\
		"vsubpd	%%ymm0,%%ymm2,%%ymm2							\n\t	vsubpd	%%ymm8 ,%%ymm10,%%ymm10		\n\t"\
		"vsubpd	%%ymm4,%%ymm6,%%ymm6							\n\t	vsubpd	%%ymm12,%%ymm14,%%ymm14		\n\t"\
		"vsubpd	%%ymm1,%%ymm3,%%ymm3							\n\t	vsubpd	%%ymm9 ,%%ymm11,%%ymm11		\n\t"\
		"vsubpd	%%ymm5,%%ymm7,%%ymm7							\n\t	vsubpd	%%ymm13,%%ymm15,%%ymm15		\n\t"\
	"vmovaps	%%ymm13,(%%rax) 	\n\t"/* spill ymm13 to make room for 2.0 */\
	"movq2dq	%%mm0,%%xmm13	\n\t	vbroadcastsd	%%xmm13,%%ymm13	\n\t"/* reload 2.0 from MMX0 */\
	"vfmadd132pd	%%ymm13,%%ymm2,%%ymm0						\n\t	vfmadd132pd	%%ymm13,%%ymm10,%%ymm8 		\n\t"\
	"vfmadd132pd	%%ymm13,%%ymm6,%%ymm4						\n\t	vfmadd132pd	%%ymm13,%%ymm14,%%ymm12		\n\t"\
	"vfmadd132pd	%%ymm13,%%ymm3,%%ymm1						\n\t	vfmadd132pd	%%ymm13,%%ymm11,%%ymm9 		\n\t"\
	"vfmadd132pd	%%ymm13,%%ymm7,%%ymm5						\n\t	vfmadd132pd	(%%rax),%%ymm15,%%ymm13		\n\t"\
		"vsubpd	%%ymm4,%%ymm0,%%ymm0							\n\t	vsubpd	%%ymm12,%%ymm8 ,%%ymm8 		\n\t"\
		"vsubpd	%%ymm7,%%ymm2,%%ymm2							\n\t	vsubpd	%%ymm15,%%ymm10,%%ymm10		\n\t"\
		"vsubpd	%%ymm5,%%ymm1,%%ymm1							\n\t	vsubpd	%%ymm13,%%ymm9 ,%%ymm9 		\n\t"\
		"vsubpd	%%ymm6,%%ymm3,%%ymm3							\n\t	vsubpd	%%ymm14,%%ymm11,%%ymm11		\n\t"\
	"prefetcht1	0x100(%%rcx)								\n\t	prefetcht1	0x100(%%r12)	\n\t"\
	"vmovaps	%%ymm14,(%%rax) 	\n\t"/* spill ymm14 to make room for 2.0 */\
	"movq2dq	%%mm0,%%xmm14	\n\t	vbroadcastsd	%%xmm14,%%ymm14	\n\t"/* reload 2.0 from MMX0 */\
		"vmovaps	%%ymm0,%c[__o2](%%rsi)						\n\t	vmovaps	%%ymm8 ,%c[__o2](%%r8 )	\n\t"\
		"vmovaps	%%ymm2,%c[__o3](%%rsi)						\n\t	vmovaps	%%ymm10,%c[__o3](%%r8 )	\n\t"\
		"vmovaps	%%ymm1,%c[__o2](%%rdi)						\n\t	vmovaps	%%ymm9 ,%c[__o2](%%r9 )	\n\t"\
		"vmovaps	%%ymm3,%c[__o1](%%rdi)						\n\t	vmovaps	%%ymm11,%c[__o1](%%r9 )	\n\t"\
	"vfmadd132pd	%%ymm14,%%ymm0,%%ymm4						\n\t	vfmadd132pd	%%ymm14,%%ymm8 ,%%ymm12		\n\t"\
	"vfmadd132pd	%%ymm14,%%ymm2,%%ymm7						\n\t	vfmadd132pd	%%ymm14,%%ymm10,%%ymm15		\n\t"\
	"vfmadd132pd	%%ymm14,%%ymm1,%%ymm5						\n\t	vfmadd132pd	%%ymm14,%%ymm9 ,%%ymm13		\n\t"\
	"vfmadd132pd	%%ymm14,%%ymm3,%%ymm6						\n\t	vfmadd132pd	(%%rax),%%ymm11,%%ymm14		\n\t"\
		"vmovaps	%%ymm4,        (%%rsi)						\n\t	vmovaps	%%ymm12,        (%%r8 )	\n\t"\
		"vmovaps	%%ymm7,%c[__o1](%%rsi)						\n\t	vmovaps	%%ymm15,%c[__o1](%%r8 )	\n\t"\
		"vmovaps	%%ymm5,        (%%rdi)						\n\t	vmovaps	%%ymm13,        (%%r9 )	\n\t"\
		"vmovaps	%%ymm6,%c[__o3](%%rdi)						\n\t	vmovaps	%%ymm14,%c[__o3](%%r9 )	\n\t"\
	"prefetcht1	0x100(%%rax)									\n\tprefetcht1	0x100(%%r10)\n\t"\
		/* SSE2_RADIX4_DIT_0TWIDDLE_B(r8 ): */						/* SSE2_RADIX4_DIT_0TWIDDLE_B(r24): */\
		"movslq		0x10(%%r15),%%rax	\n\t	movslq		0x30(%%r15),%%r10	\n\t"/* off[4-7],[c-f] */\
		"movslq		0x14(%%r15),%%rbx	\n\t	movslq		0x34(%%r15),%%r11	\n\t"\
		"movslq		0x18(%%r15),%%rcx	\n\t	movslq		0x38(%%r15),%%r12	\n\t"\
		"movslq		0x1c(%%r15),%%rdx	\n\t	movslq		0x3c(%%r15),%%r13	\n\t"\
		"leaq	(%%r14,%%rax,8),%%rax	\n\t	leaq	(%%r14,%%r10,8),%%r10	\n\t"/* in0 + off[4-7],[c-f] */\
		"leaq	(%%r14,%%rbx,8),%%rbx	\n\t	leaq	(%%r14,%%r11,8),%%r11	\n\t"\
		"leaq	(%%r14,%%rcx,8),%%rcx	\n\t	leaq	(%%r14,%%r12,8),%%r12	\n\t"\
		"leaq	(%%r14,%%rdx,8),%%rdx	\n\t	leaq	(%%r14,%%r13,8),%%r13	\n\t"\
		"addq	$%c[__o4],%%rsi			\n\t	addq	$%c[__o4],%%r8	 	\n\t"/* out0+[4,c]*ostride */\
		"vmovaps	    (%%rax),%%ymm2							\n\t	vmovaps	    (%%r10),%%ymm10	\n\t"\
		"vmovaps	    (%%rcx),%%ymm6							\n\t	vmovaps	    (%%r12),%%ymm14	\n\t"\
		"vmovaps	0x20(%%rax),%%ymm3							\n\t	vmovaps	0x20(%%r10),%%ymm11	\n\t"\
		"vmovaps	0x20(%%rcx),%%ymm7							\n\t	vmovaps	0x20(%%r12),%%ymm15	\n\t"\
		"vmovaps	    (%%rbx),%%ymm0							\n\t	vmovaps	    (%%r11),%%ymm8 	\n\t"\
		"vmovaps	    (%%rdx),%%ymm4							\n\t	vmovaps	    (%%r13),%%ymm12	\n\t"\
		"vmovaps	0x20(%%rbx),%%ymm1							\n\t	vmovaps	0x20(%%r11),%%ymm9 	\n\t"\
		"vmovaps	0x20(%%rdx),%%ymm5							\n\t	vmovaps	0x20(%%r13),%%ymm13	\n\t"\
		"vsubpd	%%ymm0,%%ymm2,%%ymm2							\n\t	vsubpd	%%ymm8 ,%%ymm10,%%ymm10		\n\t"\
		"vsubpd	%%ymm4,%%ymm6,%%ymm6							\n\t	vsubpd	%%ymm12,%%ymm14,%%ymm14		\n\t"\
		"vsubpd	%%ymm1,%%ymm3,%%ymm3							\n\t	vsubpd	%%ymm9 ,%%ymm11,%%ymm11		\n\t"\
		"vsubpd	%%ymm5,%%ymm7,%%ymm7							\n\t	vsubpd	%%ymm13,%%ymm15,%%ymm15		\n\t"\
	"vmovaps	%%ymm13,(%%rax) 	\n\t"/* spill ymm13 to make room for 2.0 */\
	"movq2dq	%%mm0,%%xmm13	\n\t	vbroadcastsd	%%xmm13,%%ymm13	\n\t"/* reload 2.0 from MMX0 */\
	"vfmadd132pd	%%ymm13,%%ymm2,%%ymm0						\n\t	vfmadd132pd	%%ymm13,%%ymm10,%%ymm8 		\n\t"\
	"vfmadd132pd	%%ymm13,%%ymm6,%%ymm4						\n\t	vfmadd132pd	%%ymm13,%%ymm14,%%ymm12		\n\t"\
	"vfmadd132pd	%%ymm13,%%ymm3,%%ymm1						\n\t	vfmadd132pd	%%ymm13,%%ymm11,%%ymm9 		\n\t"\
	"vfmadd132pd	%%ymm13,%%ymm7,%%ymm5						\n\t	vfmadd132pd	(%%rax),%%ymm15,%%ymm13		\n\t"\
		"vsubpd	%%ymm4,%%ymm0,%%ymm0							\n\t	vsubpd	%%ymm12,%%ymm8 ,%%ymm8 		\n\t"\
		"vsubpd	%%ymm7,%%ymm2,%%ymm2							\n\t	vsubpd	%%ymm15,%%ymm10,%%ymm10		\n\t"\
		"vsubpd	%%ymm5,%%ymm1,%%ymm1							\n\t	vsubpd	%%ymm13,%%ymm9 ,%%ymm9 		\n\t"\
		"vsubpd	%%ymm6,%%ymm3,%%ymm3							\n\t	vsubpd	%%ymm14,%%ymm11,%%ymm11		\n\t"\
	"prefetcht1	0x100(%%rcx)								\n\t	prefetcht1	0x100(%%r12)	\n\t"\
		"leaq	0x20(%%rsi),%%rdi								\n\t	leaq	0x20(%%r8 ),%%r9 	\n\t"\
	"vmovaps	%%ymm14,(%%rax) 	\n\t"/* spill ymm14 to make room for 2.0 */\
	"movq2dq	%%mm0,%%xmm14	\n\t	vbroadcastsd	%%xmm14,%%ymm14	\n\t"/* reload 2.0 from MMX0 */\
		"vmovaps	%%ymm0,%c[__o2](%%rsi)						\n\t	vmovaps	%%ymm8 ,%c[__o2](%%r8 )	\n\t"\
		"vmovaps	%%ymm2,%c[__o3](%%rsi)						\n\t	vmovaps	%%ymm10,%c[__o3](%%r8 )	\n\t"\
		"vmovaps	%%ymm1,%c[__o2](%%rdi)						\n\t	vmovaps	%%ymm9 ,%c[__o2](%%r9 )	\n\t"\
		"vmovaps	%%ymm3,%c[__o1](%%rdi)						\n\t	vmovaps	%%ymm11,%c[__o1](%%r9 )	\n\t"\
	"vfmadd132pd	%%ymm14,%%ymm0,%%ymm4						\n\t	vfmadd132pd	%%ymm14,%%ymm8 ,%%ymm12		\n\t"\
	"vfmadd132pd	%%ymm14,%%ymm2,%%ymm7						\n\t	vfmadd132pd	%%ymm14,%%ymm10,%%ymm15		\n\t"\
	"vfmadd132pd	%%ymm14,%%ymm1,%%ymm5						\n\t	vfmadd132pd	%%ymm14,%%ymm9 ,%%ymm13		\n\t"\
	"vfmadd132pd	%%ymm14,%%ymm3,%%ymm6						\n\t	vfmadd132pd	(%%rax),%%ymm11,%%ymm14		\n\t"\
		"vmovaps	%%ymm4,        (%%rsi)						\n\t	vmovaps	%%ymm12,        (%%r8 )	\n\t"\
		"vmovaps	%%ymm7,%c[__o1](%%rsi)						\n\t	vmovaps	%%ymm15,%c[__o1](%%r8 )	\n\t"\
		"vmovaps	%%ymm5,        (%%rdi)						\n\t	vmovaps	%%ymm13,        (%%r9 )	\n\t"\
		"vmovaps	%%ymm6,%c[__o3](%%rdi)						\n\t	vmovaps	%%ymm14,%c[__o3](%%r9 )	\n\t"\
	"prefetcht1	0x100(%%rax)	\n\t"\
	/******************************************************************************/\
	/*** Now do 4 DFTs with internal twiddles on the 4*stride - separated data: ***/\
	/******************************************************************************/\
		"movq	%[__two],%%r15	\n\t"/* No longer need r14,r15 for I-addressing */\
		/* Block 0: */												/* Block 2: */\
		"movq	%[__out0],%%rax									\n\t	leaq	%c[__o2](%%rax),%%r10	\n\t"/* All addresses += 2*ostride */\
	"leaq %c[__o4](%%rax),%%rbx \n\t"/* out0+  [4*ostride] */"	\n\t	leaq	%c[__o2](%%rbx),%%r11	\n\t"\
	"leaq %c[__o4](%%rbx),%%rcx \n\t"/* out0+2*[4*ostride] */"	\n\t	leaq	%c[__o2](%%rcx),%%r12	\n\t"\
	"leaq %c[__o4](%%rcx),%%rdx \n\t"/* out0+3*[4*ostride] */"	\n\t	leaq	%c[__o2](%%rdx),%%r13	\n\t"\
		"														\n\t	movq	%[__isrt2],%%rdi	\n\t"\
		"vmovaps	    (%%rax),%%ymm0							\n\t	vmovaps	    (%%r10),%%ymm8 	\n\t"/* ar */\
		"vmovaps	0x20(%%rax),%%ymm1							\n\t	vmovaps	0x20(%%r10),%%ymm9 	\n\t"/* ai */\
		"vmovaps	    (%%rbx),%%ymm2							\n\t	vmovaps	    (%%r11),%%ymm10	\n\t"/* br */\
		"vmovaps	0x20(%%rbx),%%ymm3							\n\t	vmovaps	0x20(%%r11),%%ymm11	\n\t"/* bi */\
		"vmovaps	    (%%rcx),%%ymm4							\n\t	vmovaps	    (%%r12),%%ymm12	\n\t"/* cr */\
		"vmovaps	0x20(%%rcx),%%ymm5							\n\t	vmovaps	0x20(%%r12),%%ymm13	\n\t"/* ci */\
		"vmovaps	    (%%rdx),%%ymm6							\n\t	vmovaps	    (%%r13),%%ymm14	\n\t"/* dr */\
		"vmovaps	0x20(%%rdx),%%ymm7							\n\t	vmovaps	0x20(%%r13),%%ymm15	\n\t"/* di */\
		"vsubpd	    %%ymm2 ,%%ymm0,%%ymm0						\n\t	vsubpd	    %%ymm11,%%ymm8 ,%%ymm8 	\n\t"/* ar-bi */\
		"vsubpd	    %%ymm3 ,%%ymm1,%%ymm1						\n\t	vsubpd	    %%ymm10,%%ymm9 ,%%ymm9 	\n\t"/* ai-br */\
		"vsubpd	    %%ymm6 ,%%ymm4,%%ymm4						\n\t	vsubpd	    %%ymm12,%%ymm13,%%ymm13	\n\t"/* ci-cr */\
		"vsubpd	    %%ymm7 ,%%ymm5,%%ymm5						\n\t	vsubpd	    %%ymm15,%%ymm14,%%ymm14	\n\t"/* dr-di */\
	"vmovaps	%%ymm15,(%%rax) 	\n\t"/* spill ymm15 to make room for 2.0 */"	vmovaps	(%%r15),%%ymm15	\n\t"/* two */\
		"vfmadd132pd	%%ymm15,%%ymm0,%%ymm2					\n\t	vfmadd132pd	%%ymm15,%%ymm8 ,%%ymm11	\n\t"/* ar+bi */\
		"vfmadd132pd	%%ymm15,%%ymm1,%%ymm3					\n\t	vfmadd132pd	%%ymm15,%%ymm9 ,%%ymm10	\n\t"/* ai+br */\
		"vfmadd132pd	%%ymm15,%%ymm4,%%ymm6					\n\t	vfmadd132pd	%%ymm15,%%ymm13,%%ymm12	\n\t"/* ci+cr */\
		"vfmadd132pd	%%ymm15,%%ymm5,%%ymm7					\n\t	vfmadd132pd	(%%rax),%%ymm14,%%ymm15	\n\t"/* dr+di */\
		"														\n\t		vsubpd	%%ymm14,%%ymm12,%%ymm12		\n\t"\
		"														\n\t		vsubpd	%%ymm15,%%ymm13,%%ymm13		\n\t"\
		"vsubpd	%%ymm5,%%ymm0,%%ymm0							\n\t	vfmadd132pd	(%%r15),%%ymm12,%%ymm14		\n\t"\
		"vsubpd	%%ymm4,%%ymm1,%%ymm1							\n\t	vfmadd132pd	(%%r15),%%ymm13,%%ymm15		\n\t"\
		"vsubpd	%%ymm6,%%ymm2,%%ymm2							\n\t	vfnmadd231pd	(%%rdi),%%ymm12,%%ymm11		\n\t"/* x = x - y.isrt2 */\
		"vsubpd	%%ymm7,%%ymm3,%%ymm3							\n\t	vfnmadd231pd	(%%rdi),%%ymm13,%%ymm9 		\n\t"\
		"vmovaps	%%ymm0,    (%%rdx)							\n\t	vfnmadd231pd	(%%rdi),%%ymm14,%%ymm10		\n\t"\
		"vmovaps	%%ymm1,0x20(%%rbx)							\n\t	vfnmadd231pd	(%%rdi),%%ymm15,%%ymm8 		\n\t"\
		"vmovaps	%%ymm2,    (%%rcx)							\n\t		vmovaps	%%ymm11,    (%%r12)	\n\t"\
		"vmovaps	%%ymm3,0x20(%%rcx)							\n\t		vmovaps	%%ymm9 ,0x20(%%r12)	\n\t"\
	"vfmadd132pd	(%%r15),%%ymm0,%%ymm5						\n\t		vmovaps	%%ymm10,0x20(%%r11)	\n\t"\
	"vfmadd132pd	(%%r15),%%ymm1,%%ymm4						\n\t		vmovaps	%%ymm8 ,    (%%r13)	\n\t"\
	"vfmadd132pd	(%%r15),%%ymm2,%%ymm6						\n\t	vfmadd132pd	-0x20(%%rdi),%%ymm11,%%ymm12	\n\t"/* y = x + y.sqrt2 = x + y.isrt2 */\
	"vfmadd132pd	(%%r15),%%ymm3,%%ymm7						\n\t	vfmadd132pd	-0x20(%%rdi),%%ymm9 ,%%ymm13	\n\t"\
		"vmovaps	%%ymm5,    (%%rbx)							\n\t	vfmadd132pd	-0x20(%%rdi),%%ymm10,%%ymm14	\n\t"\
		"vmovaps	%%ymm4,0x20(%%rdx)							\n\t	vfmadd132pd	-0x20(%%rdi),%%ymm8 ,%%ymm15	\n\t"\
		"vmovaps	%%ymm6,    (%%rax)							\n\t		vmovaps	%%ymm12,    (%%r10)	\n\t"\
		"vmovaps	%%ymm7,0x20(%%rax)							\n\t		vmovaps	%%ymm13,0x20(%%r10)	\n\t"\
		"														\n\t		vmovaps	%%ymm14,0x20(%%r13)	\n\t"\
		"														\n\t		vmovaps	%%ymm15,    (%%r11)	\n\t"\
		/* Block 1: */												/* Block 3: */\
	"addq $%c[__o1],%%rax\n\t"/* addr += 1*ostride */"	\n\t	leaq	%c[__o2](%%rax),%%r10	\n\t"/* All addresses += 1*ostride */\
	"addq $%c[__o1],%%rbx\n\t"/* relative to Block 0 */"\n\t	leaq	%c[__o2](%%rbx),%%r11	\n\t"/* relative to Block 1 */\
	"addq $%c[__o1],%%rcx								\n\t	leaq	%c[__o2](%%rcx),%%r12	\n\t"\
	"addq $%c[__o1],%%rdx								\n\t	leaq	%c[__o2](%%rdx),%%r13	\n\t"\
		"leaq	0x20(%%rdi),%%rsi	\n\t"/* cc0, from isrt2 [rdi,rsi shared by both cols] */\
		"vmovaps	    (%%rdx),%%ymm0							\n\t	vmovaps	    (%%r13),%%ymm8 	\n\t"\
		"vmovaps	0x20(%%rdx),%%ymm1							\n\t	vmovaps	0x20(%%r13),%%ymm9 	\n\t"\
		"vmovaps	    (%%rcx),%%ymm4							\n\t	vmovaps	    (%%r12),%%ymm12	\n\t"\
		"vmovaps	0x20(%%rcx),%%ymm5							\n\t	vmovaps	0x20(%%r12),%%ymm13	\n\t"\
		"vmovaps	    %%ymm0 ,%%ymm2							\n\t	vmovaps	    %%ymm8 ,%%ymm10	\n\t"\
		"vmovaps	    %%ymm1 ,%%ymm3							\n\t	vmovaps	    %%ymm9 ,%%ymm11	\n\t"\
		"vmovaps	    %%ymm4 ,%%ymm6							\n\t	vmovaps	    %%ymm12,%%ymm14	\n\t"\
	/*	"vmovaps	    %%ymm5 ,%%ymm7							\n\t	vmovaps	    %%ymm13,%%ymm15	\n\t"*/\
		"vmovaps	0x20(%%rsi),%%ymm7							\n\t	vmovaps	    (%%rsi),%%ymm15	\n\t"/* Instead use these to store [c,s] */\
		"vmulpd		    %%ymm7 ,%%ymm0,%%ymm0					\n\t	vmulpd		    %%ymm15,%%ymm8 ,%%ymm8 	\n\t"\
		"vmulpd		    %%ymm7 ,%%ymm1,%%ymm1					\n\t	vmulpd		    %%ymm15,%%ymm9 ,%%ymm9 	\n\t"\
		"vmulpd		    %%ymm15,%%ymm4,%%ymm4					\n\t	vmulpd		    %%ymm7 ,%%ymm12,%%ymm12	\n\t"\
		"vmulpd		    %%ymm15,%%ymm5,%%ymm5					\n\t	vmulpd		    %%ymm7 ,%%ymm13,%%ymm13	\n\t"\
	"vfnmadd231pd	    %%ymm15,%%ymm2,%%ymm1				\n\t	vfnmadd231pd	    %%ymm7 ,%%ymm10,%%ymm9 	\n\t"\
	" vfmadd231pd	    %%ymm15,%%ymm3,%%ymm0				\n\t	 vfmadd231pd	    %%ymm7 ,%%ymm11,%%ymm8 	\n\t"\
	"vfnmadd231pd	    %%ymm7 ,%%ymm6,%%ymm5				\n\t	vfnmadd231pd	    %%ymm15,%%ymm14,%%ymm13	\n\t"\
	" vfmadd231pd	0x20(%%rcx),%%ymm7,%%ymm4				\n\t	 vfmadd231pd	0x20(%%r12),%%ymm15,%%ymm12	\n\t"\
		"vmovaps	%%ymm5,%%ymm7								\n\t	vmovaps	%%ymm13,%%ymm15		\n\t"\
		"vmovaps	%%ymm4,%%ymm6								\n\t	vmovaps	%%ymm12,%%ymm14		\n\t"\
		"vaddpd	%%ymm0,%%ymm4,%%ymm4							\n\t	vaddpd	%%ymm8 ,%%ymm12,%%ymm12		\n\t"\
		"vaddpd	%%ymm1,%%ymm5,%%ymm5							\n\t	vaddpd	%%ymm9 ,%%ymm13,%%ymm13		\n\t"\
		"vsubpd	%%ymm0,%%ymm6,%%ymm6							\n\t	vsubpd	%%ymm8 ,%%ymm14,%%ymm14		\n\t"\
		"vsubpd	%%ymm1,%%ymm7,%%ymm7							\n\t	vsubpd	%%ymm9 ,%%ymm15,%%ymm15		\n\t"\
		"vmovaps	    (%%rbx),%%ymm2							\n\t	vmovaps	    (%%r11),%%ymm10	\n\t"\
		"vmovaps	0x20(%%rbx),%%ymm3							\n\t	vmovaps	0x20(%%r11),%%ymm11	\n\t"\
		"vmovaps	    (%%rax),%%ymm0							\n\t	vmovaps	    (%%r10),%%ymm8 	\n\t"\
		"vmovaps	0x20(%%rax),%%ymm1							\n\t	vmovaps	0x20(%%r10),%%ymm9 	\n\t"\
		"vaddpd	0x20(%%rbx),%%ymm2,%%ymm2						\n\t	vsubpd	0x20(%%r11),%%ymm10,%%ymm10	\n\t"\
		"vsubpd	    (%%rbx),%%ymm3,%%ymm3						\n\t	vaddpd	    (%%r11),%%ymm11,%%ymm11	\n\t"\
	"vfnmadd231pd		 (%%rdi),%%ymm2,%%ymm0				\n\t	vfnmadd231pd		 (%%rdi),%%ymm10,%%ymm8 	\n\t"/* x = x - y.isrt2 */\
	"vfnmadd231pd		 (%%rdi),%%ymm3,%%ymm1				\n\t	vfnmadd231pd		 (%%rdi),%%ymm11,%%ymm9 	\n\t"\
	" vfmadd132pd	-0x20(%%rdi),%%ymm0,%%ymm2				\n\t	 vfmadd132pd	-0x20(%%rdi),%%ymm8 ,%%ymm10	\n\t"/* y = x + y.sqrt2 = x + y.isrt2 */\
	" vfmadd132pd	-0x20(%%rdi),%%ymm1,%%ymm3				\n\t	 vfmadd132pd	-0x20(%%rdi),%%ymm9 ,%%ymm11	\n\t"\
		"vsubpd	%%ymm7,%%ymm0,%%ymm0							\n\t	vsubpd	%%ymm13,%%ymm10,%%ymm10		\n\t"\
		"vsubpd	%%ymm6,%%ymm1,%%ymm1							\n\t	vsubpd	%%ymm12,%%ymm11,%%ymm11		\n\t"\
		"vsubpd	%%ymm4,%%ymm2,%%ymm2							\n\t	vsubpd	%%ymm14,%%ymm8 ,%%ymm8 		\n\t"\
		"vsubpd	%%ymm5,%%ymm3,%%ymm3							\n\t	vsubpd	%%ymm15,%%ymm9 ,%%ymm9 		\n\t"\
		"vmovaps	%%ymm0,    (%%rdx)							\n\t	vmovaps	%%ymm10,    (%%r13)	\n\t"\
		"vmovaps	%%ymm1,0x20(%%rbx)							\n\t	vmovaps	%%ymm11,0x20(%%r11)	\n\t"\
		"vmovaps	%%ymm2,    (%%rcx)							\n\t	vmovaps	%%ymm8 ,    (%%r12)	\n\t"\
		"vmovaps	%%ymm3,0x20(%%rcx)							\n\t	vmovaps	%%ymm9 ,0x20(%%r12)	\n\t"\
	"vmovaps	%%ymm15,(%%rax) 	\n\t"/* spill ymm15 to make room for 2.0 */"	vmovaps	(%%r15),%%ymm15	\n\t"/* two */\
	"vfmadd132pd	%%ymm15,%%ymm0,%%ymm7						\n\t	vfmadd132pd	%%ymm15,%%ymm10,%%ymm13		\n\t"\
	"vfmadd132pd	%%ymm15,%%ymm1,%%ymm6						\n\t	vfmadd132pd	%%ymm15,%%ymm11,%%ymm12		\n\t"\
	"vfmadd132pd	%%ymm15,%%ymm2,%%ymm4						\n\t	vfmadd132pd	%%ymm15,%%ymm8 ,%%ymm14		\n\t"\
	"vfmadd132pd	%%ymm15,%%ymm3,%%ymm5						\n\t	vfmadd132pd	(%%rax),%%ymm9 ,%%ymm15		\n\t"\
		"vmovaps	%%ymm7,    (%%rbx)							\n\t	vmovaps	%%ymm13,    (%%r11)	\n\t"\
		"vmovaps	%%ymm6,0x20(%%rdx)							\n\t	vmovaps	%%ymm12,0x20(%%r13)	\n\t"\
		"vmovaps	%%ymm4,    (%%rax)							\n\t	vmovaps	%%ymm14,    (%%r10)	\n\t"\
		"vmovaps	%%ymm5,0x20(%%rax)							\n\t	vmovaps	%%ymm15,0x20(%%r10)	\n\t"\
		:					/* outputs: none */\
		:[in0] "m" (Xin0)	/* Input-address-16-tet base pointer */\
		,[off] "m" (Xoff)	/* and pointer to uint32 array of 16 double* index offsets */\
		,[__isrt2] "m" (Xisrt2)\
		,[__two] "m" (Xtwo)\
		,[__out0] "m" (Xout0)\
		,[__o1] "e" (Xo1)\
		,[__o2] "e" (Xo2)\
		,[__o3] "e" (Xo3)\
		,[__o4] "e" (Xo4)\
		: "cc","memory","mm0","rax","rbx","rcx","rdx","rsi","rdi","r8","r9","r10","r11","r12","r13","r14","r15","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15"	/* Clobbered registers */\
	);\
	}

  #else	// USE_AVX2 = False, i.e. AVX sans FMA3:

	/* Complex multiply of 2 roots of unity - use e.g. for "multiply up" of sincos twiddles. */
	#define SSE2_CMUL_EXPO(XcA,XcB,XcAmB,XcApB)\
	{\
	__asm__ volatile (\
		"movq	%[__cA]		,%%rax\n\t"\
		"movq	%[__cB]		,%%rbx\n\t"\
		"movq	%[__cAmB]	,%%rcx\n\t"\
		"movq	%[__cApB]	,%%rdx\n\t"\
		"\n\t"\
		"vmovaps	    (%%rax),%%ymm0\n\t"\
		"vmovaps	0x20(%%rax),%%ymm2\n\t"\
		"vmovaps	    (%%rbx),%%ymm4\n\t"\
		"vmovaps	0x20(%%rbx),%%ymm5\n\t"\
		"vmovaps	%%ymm0,%%ymm1\n\t"\
		"vmovaps	%%ymm2,%%ymm3\n\t"\
		"\n\t"\
		"vmulpd	%%ymm4,%%ymm0,%%ymm0\n\t"\
		"vmulpd	%%ymm5,%%ymm1,%%ymm1\n\t"\
		"vmulpd	%%ymm4,%%ymm2,%%ymm2\n\t"\
		"vmulpd	%%ymm5,%%ymm3,%%ymm3\n\t"\
		"vmovaps	%%ymm0,%%ymm4\n\t"\
		"vmovaps	%%ymm1,%%ymm5\n\t"\
		"vaddpd	%%ymm3,%%ymm0,%%ymm0\n\t"\
		"vsubpd	%%ymm2,%%ymm1,%%ymm1\n\t"\
		"vsubpd	%%ymm3,%%ymm4,%%ymm4\n\t"\
		"vaddpd	%%ymm2,%%ymm5,%%ymm5\n\t"\
		"vmovaps	%%ymm0,    (%%rcx)\n\t"\
		"vmovaps	%%ymm1,0x20(%%rcx)\n\t"\
		"vmovaps	%%ymm4,    (%%rdx)\n\t"\
		"vmovaps	%%ymm5,0x20(%%rdx)\n\t"\
		:					/* outputs: none */\
		: [__cA]  "m" (XcA)	/* All inputs from memory addresses here */\
		 ,[__cB]  "m" (XcB)\
		 ,[__cAmB] "m" (XcAmB)\
		 ,[__cApB] "m" (XcApB)\
		: "cc","memory","rax","rbx","rcx","rdx","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5"		/* Clobbered registers */\
	);\
	}

	/* Twin radix-3 complex DFT: two independent 3-point DFTs run side-by-side in the two asm
	columns - left column on inputs i0-i2 -> outputs o0-o2 (ptrs rax,rbx,rcx; regs ymm0-7),
	right column on inputs j0-j2 -> outputs u0-u2 (ptrs r10-r12; regs ymm8-15). Both columns
	share the trig-constant pair at cc0 [(rdx) and 0x20(rdx)] - presumably {cos(2*pi/3)-1,
	sin(2*pi/3)}; TODO(review): confirm the data layout at cc0 against the radix-3 callers.
	All pointers must reference 32-byte-aligned data (vmovaps). Inputs are fully consumed
	before output pointers are loaded, so in-place use (oN == iN) is supported.
	Comments below describe the left column; the right column is the exact mirror. */
	#define SSE2_RADIX_03_DFT_X2(Xcc0, Xi0,Xi1,Xi2, Xo0,Xo1,Xo2, Xj0,Xj1,Xj2, Xu0,Xu1,Xu2)\
	{\
	__asm__ volatile (\
		"movq	%[__i0],%%rax				\n\t	movq	%[__j0],%%r10		\n\t"\
		"movq	%[__i1],%%rbx				\n\t	movq	%[__j1],%%r11		\n\t"\
		"movq	%[__i2],%%rcx				\n\t	movq	%[__j2],%%r12		\n\t"\
		"movq	%[__cc0],%%rdx				\n\t"\
		"vmovaps	    (%%rbx),%%ymm2		\n\t	vmovaps	    (%%r11),%%ymm10	\n\t"/* b = in1 */\
		"vmovaps	0x20(%%rbx),%%ymm3		\n\t	vmovaps	0x20(%%r11),%%ymm11	\n\t"\
		"vmovaps	    (%%rax),%%ymm0		\n\t	vmovaps	    (%%r10),%%ymm8 	\n\t"/* a = in0 */\
		"vmovaps	0x20(%%rax),%%ymm1		\n\t	vmovaps	0x20(%%r10),%%ymm9 	\n\t"\
		"vmovaps	    (%%rcx),%%ymm6		\n\t	vmovaps	    (%%r12),%%ymm14	\n\t"/* c = in2 */\
		"vmovaps	0x20(%%rcx),%%ymm7		\n\t	vmovaps	0x20(%%r12),%%ymm15	\n\t"\
		"vmovaps	%%ymm2,%%ymm4			\n\t	vmovaps	%%ymm10,%%ymm12		\n\t"/* copy of b */\
		"vmovaps	%%ymm3,%%ymm5			\n\t	vmovaps	%%ymm11,%%ymm13		\n\t"\
		"movq	%[__o0],%%rax				\n\t	movq	%[__u0],%%r10		\n\t"/* inputs all read - retarget ptrs at outputs */\
		"movq	%[__o1],%%rbx				\n\t	movq	%[__u1],%%r11		\n\t"\
		"movq	%[__o2],%%rcx				\n\t	movq	%[__u2],%%r12		\n\t"\
		"vaddpd	%%ymm6,%%ymm2,%%ymm2		\n\t	vaddpd	%%ymm14,%%ymm10,%%ymm10		\n\t"/* t = b+c */\
		"vaddpd	%%ymm7,%%ymm3,%%ymm3		\n\t	vaddpd	%%ymm15,%%ymm11,%%ymm11		\n\t"\
		"vsubpd	%%ymm6,%%ymm4,%%ymm4		\n\t	vsubpd	%%ymm14,%%ymm12,%%ymm12		\n\t"/* u = b-c */\
		"vsubpd	%%ymm7,%%ymm5,%%ymm5		\n\t	vsubpd	%%ymm15,%%ymm13,%%ymm13		\n\t"\
		"vaddpd	%%ymm2,%%ymm0,%%ymm0		\n\t	vaddpd	%%ymm10,%%ymm8 ,%%ymm8 		\n\t"/* out0 = a+t [DC term] */\
		"vaddpd	%%ymm3,%%ymm1,%%ymm1		\n\t	vaddpd	%%ymm11,%%ymm9 ,%%ymm9 		\n\t"\
		"vmovaps	    (%%rdx),%%ymm6		\n\t"/* ymm6 = cc0[0], ymm7 = cc0[1]; shared by both columns */\
		"vmovaps	0x20(%%rdx),%%ymm7		\n\t"\
		"vmovaps	%%ymm0,    (%%rax)		\n\t	vmovaps	%%ymm8 ,    (%%r10)	\n\t"/* write out0 */\
		"vmovaps	%%ymm1,0x20(%%rax)		\n\t	vmovaps	%%ymm9 ,0x20(%%r10)	\n\t"\
		"vmulpd	%%ymm6,%%ymm2,%%ymm2		\n\t	vmulpd	%%ymm6 ,%%ymm10,%%ymm10		\n\t"/* t *= cc0[0] */\
		"vmulpd	%%ymm6,%%ymm3,%%ymm3		\n\t	vmulpd	%%ymm6 ,%%ymm11,%%ymm11		\n\t"\
		"vmulpd	%%ymm7,%%ymm4,%%ymm4		\n\t	vmulpd	%%ymm7 ,%%ymm12,%%ymm12		\n\t"/* u *= cc0[1] */\
		"vmulpd	%%ymm7,%%ymm5,%%ymm5		\n\t	vmulpd	%%ymm7 ,%%ymm13,%%ymm13		\n\t"\
		"vaddpd	%%ymm0,%%ymm2,%%ymm2		\n\t	vaddpd	%%ymm8 ,%%ymm10,%%ymm10		\n\t"/* r = (a+t) + t*cc0[0]; = a + t*cos(2*pi/3) if cc0[0] = cos-1 */\
		"vaddpd	%%ymm1,%%ymm3,%%ymm3		\n\t	vaddpd	%%ymm9 ,%%ymm11,%%ymm11		\n\t"\
		"vmovaps	%%ymm2,%%ymm0			\n\t	vmovaps	%%ymm10,%%ymm8 		\n\t"/* copy of r */\
		"vmovaps	%%ymm3,%%ymm1			\n\t	vmovaps	%%ymm11,%%ymm9 		\n\t"\
		"vsubpd	%%ymm5,%%ymm2,%%ymm2		\n\t	vsubpd	%%ymm13,%%ymm10,%%ymm10		\n\t"/* out1.re = r.re - u.im*cc0[1] */\
		"vaddpd	%%ymm4,%%ymm3,%%ymm3		\n\t	vaddpd	%%ymm12,%%ymm11,%%ymm11		\n\t"/* out1.im = r.im + u.re*cc0[1] */\
		"vaddpd	%%ymm5,%%ymm0,%%ymm0		\n\t	vaddpd	%%ymm13,%%ymm8 ,%%ymm8 		\n\t"/* out2.re = r.re + u.im*cc0[1] */\
		"vsubpd	%%ymm4,%%ymm1,%%ymm1		\n\t	vsubpd	%%ymm12,%%ymm9 ,%%ymm9 		\n\t"/* out2.im = r.im - u.re*cc0[1] */\
		"vmovaps	%%ymm2,    (%%rbx)		\n\t	vmovaps	%%ymm10,    (%%r11)	\n\t"/* write out1 */\
		"vmovaps	%%ymm3,0x20(%%rbx)		\n\t	vmovaps	%%ymm11,0x20(%%r11)	\n\t"\
		"vmovaps	%%ymm0,    (%%rcx)		\n\t	vmovaps	%%ymm8 ,    (%%r12)	\n\t"/* write out2 */\
		"vmovaps	%%ymm1,0x20(%%rcx)		\n\t	vmovaps	%%ymm9 ,0x20(%%r12)	\n\t"\
		:					/* outputs: none */\
		: [__cc0] "m" (Xcc0)	/* All inputs from memory addresses here */\
		 ,[__i0] "m" (Xi0)\
		 ,[__i1] "m" (Xi1)\
		 ,[__i2] "m" (Xi2)\
		 ,[__o0] "m" (Xo0)\
		 ,[__o1] "m" (Xo1)\
		 ,[__o2] "m" (Xo2)\
		 ,[__j0] "m" (Xj0)\
		 ,[__j1] "m" (Xj1)\
		 ,[__j2] "m" (Xj2)\
		 ,[__u0] "m" (Xu0)\
		 ,[__u1] "m" (Xu1)\
		 ,[__u2] "m" (Xu2)\
		: "cc","memory","rax","rbx","rcx","rdx","r10","r11","r12","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15"		/* Clobbered registers */\
	);\
	}

	/* Twin radix-5 complex DFT, no twiddles: two independent 5-point DFTs run side-by-side,
	left asm column on inputs i0-i4 -> outputs o0-o4 (ptrs rsi,rax-rdx,rdi; regs ymm0-7),
	right column on inputs j0-j4 -> outputs u0-u4 (ptrs r10-r15; regs ymm8-15). cc1 points to
	the shared radix-5 trig constants at byte offsets 0,0x20,0x40,0x60,0x80; their exact values
	are not visible here - TODO(review): confirm the {cn1,cn2,ss3,sn1,sn2}-style layout vs callers.
	NOTE(review): the [__two] operand is declared but never referenced in the asm body.
	WARNING: the i0/j0 input slots are reused as scratch storage partway through (stores to
	(%%rsi)/(%%r10) below), i.e. input 0 of each column is destroyed by this macro.
	All pointers must reference 32-byte-aligned data (vmovaps).
	Comments describe the left column; the right column is the exact mirror. */
	#define SSE2_RADIX_05_DFT_0TWIDDLE_X2(Xcc1,Xtwo, Xi0,Xi1,Xi2,Xi3,Xi4, Xo0,Xo1,Xo2,Xo3,Xo4, Xj0,Xj1,Xj2,Xj3,Xj4, Xu0,Xu1,Xu2,Xu3,Xu4)\
	{\
	__asm__ volatile (\
		"movq	%[__i0],%%rsi				\n\t	movq	%[__j0],%%r10		\n\t"\
		"movq	%[__i1],%%rax				\n\t	movq	%[__j1],%%r11		\n\t"\
		"movq	%[__i2],%%rbx				\n\t	movq	%[__j2],%%r12		\n\t"\
		"movq	%[__i3],%%rcx				\n\t	movq	%[__j3],%%r13		\n\t"\
		"movq	%[__i4],%%rdx				\n\t	movq	%[__j4],%%r14		\n\t"\
		"movq	%[__o0],%%rdi				\n\t	movq	%[__u0],%%r15		\n\t"\
		"vmovaps	    (%%rax),%%ymm0		\n\t	vmovaps	    (%%r11),%%ymm8 	\n\t"/* a1 */\
		"vmovaps	0x20(%%rax),%%ymm1		\n\t	vmovaps	0x20(%%r11),%%ymm9 	\n\t"\
		"vmovaps	    (%%rbx),%%ymm2		\n\t	vmovaps	    (%%r12),%%ymm10	\n\t"/* a2 */\
		"vmovaps	0x20(%%rbx),%%ymm3		\n\t	vmovaps	0x20(%%r12),%%ymm11	\n\t"\
		"vmovaps	    (%%rcx),%%ymm4		\n\t	vmovaps	    (%%r13),%%ymm12	\n\t"/* a3 */\
		"vmovaps	0x20(%%rcx),%%ymm5		\n\t	vmovaps	0x20(%%r13),%%ymm13	\n\t"\
		"vmovaps	    (%%rdx),%%ymm6		\n\t	vmovaps	    (%%r14),%%ymm14	\n\t"/* a4 */\
		"vmovaps	0x20(%%rdx),%%ymm7		\n\t	vmovaps	0x20(%%r14),%%ymm15	\n\t"\
		"vsubpd	%%ymm6,%%ymm0,%%ymm0		\n\t	vsubpd	%%ymm14,%%ymm8 ,%%ymm8 		\n\t"/* t4 = a1-a4 */\
		"vsubpd	%%ymm7,%%ymm1,%%ymm1		\n\t	vsubpd	%%ymm15,%%ymm9 ,%%ymm9 		\n\t"\
		"vaddpd	%%ymm6,%%ymm6,%%ymm6		\n\t	vaddpd	%%ymm14,%%ymm14,%%ymm14		\n\t"/* t1 = a1+a4, via 2*a4 + t4 */\
		"vaddpd	%%ymm7,%%ymm7,%%ymm7		\n\t	vaddpd	%%ymm15,%%ymm15,%%ymm15		\n\t"\
		"vaddpd	%%ymm0,%%ymm6,%%ymm6		\n\t	vaddpd	%%ymm8 ,%%ymm14,%%ymm14		\n\t"\
		"vaddpd	%%ymm1,%%ymm7,%%ymm7		\n\t	vaddpd	%%ymm9 ,%%ymm15,%%ymm15		\n\t"\
		"vsubpd	%%ymm4,%%ymm2,%%ymm2		\n\t	vsubpd	%%ymm12,%%ymm10,%%ymm10		\n\t"/* t3 = a2-a3 */\
		"vsubpd	%%ymm5,%%ymm3,%%ymm3		\n\t	vsubpd	%%ymm13,%%ymm11,%%ymm11		\n\t"\
		"vaddpd	%%ymm4,%%ymm4,%%ymm4		\n\t	vaddpd	%%ymm12,%%ymm12,%%ymm12		\n\t"/* t2 = a2+a3, via 2*a3 + t3 */\
		"vaddpd	%%ymm5,%%ymm5,%%ymm5		\n\t	vaddpd	%%ymm13,%%ymm13,%%ymm13		\n\t"\
		"vaddpd	%%ymm2,%%ymm4,%%ymm4		\n\t	vaddpd	%%ymm10,%%ymm12,%%ymm12		\n\t"\
		"vaddpd	%%ymm3,%%ymm5,%%ymm5		\n\t	vaddpd	%%ymm11,%%ymm13,%%ymm13		\n\t"\
		"movq	%[__cc1],%%rax				\n\t"/* i1 fully read - rax now -> trig consts, shared by both columns */\
		"vsubpd	%%ymm4,%%ymm6,%%ymm6		\n\t	vsubpd	%%ymm12,%%ymm14,%%ymm14		\n\t"/* t1-t2 */\
		"vsubpd	%%ymm5,%%ymm7,%%ymm7		\n\t	vsubpd	%%ymm13,%%ymm15,%%ymm15		\n\t"\
		"vaddpd	%%ymm4,%%ymm4,%%ymm4		\n\t	vaddpd	%%ymm12,%%ymm12,%%ymm12		\n\t"/* t1+t2, via 2*t2 + (t1-t2) */\
		"vaddpd	%%ymm5,%%ymm5,%%ymm5		\n\t	vaddpd	%%ymm13,%%ymm13,%%ymm13		\n\t"\
		"vaddpd	%%ymm6,%%ymm4,%%ymm4		\n\t	vaddpd	%%ymm14,%%ymm12,%%ymm12		\n\t"\
		"vaddpd	%%ymm7,%%ymm5,%%ymm5		\n\t	vaddpd	%%ymm15,%%ymm13,%%ymm13		\n\t"\
		"vaddpd	    (%%rsi),%%ymm4,%%ymm4	\n\t	vaddpd	    (%%r10),%%ymm12,%%ymm12	\n\t"/* out0 = a0 + (t1+t2) [DC term] */\
		"vaddpd	0x20(%%rsi),%%ymm5,%%ymm5	\n\t	vaddpd	0x20(%%r10),%%ymm13,%%ymm13	\n\t"\
		"vmovaps	%%ymm4,    (%%rdi)		\n\t	vmovaps	%%ymm12,    (%%r15)		\n\t"/* write out0 */\
		"vmovaps	%%ymm5,0x20(%%rdi)		\n\t	vmovaps	%%ymm13,0x20(%%r15)		\n\t"\
		"vmulpd	0x20(%%rax),%%ymm6,%%ymm6	\n\t	vmulpd	0x20(%%rax),%%ymm14,%%ymm14	\n\t"/* (t1-t2)*cc1[1] */\
		"vmulpd	0x20(%%rax),%%ymm7,%%ymm7	\n\t	vmulpd	0x20(%%rax),%%ymm15,%%ymm15	\n\t"\
		"vsubpd	    (%%rsi),%%ymm4,%%ymm4	\n\t	vsubpd	    (%%r10),%%ymm12,%%ymm12	\n\t"/* back to t1+t2 */\
		"vsubpd	0x20(%%rsi),%%ymm5,%%ymm5	\n\t	vsubpd	0x20(%%r10),%%ymm13,%%ymm13	\n\t"\
		"vmulpd	    (%%rax),%%ymm4,%%ymm4	\n\t	vmulpd	    (%%rax),%%ymm12,%%ymm12	\n\t"/* (t1+t2)*cc1[0] */\
		"vmulpd	    (%%rax),%%ymm5,%%ymm5	\n\t	vmulpd	    (%%rax),%%ymm13,%%ymm13	\n\t"\
		"vaddpd	    (%%rdi),%%ymm4,%%ymm4	\n\t	vaddpd	    (%%r15),%%ymm12,%%ymm12	\n\t"/* + out0, read back from o0 */\
		"vaddpd	0x20(%%rdi),%%ymm5,%%ymm5	\n\t	vaddpd	0x20(%%r15),%%ymm13,%%ymm13	\n\t"\
		"vsubpd	%%ymm6,%%ymm4,%%ymm4		\n\t	vsubpd	%%ymm14,%%ymm12,%%ymm12		\n\t"/* split into the two cosine-term combos: */\
		"vsubpd	%%ymm7,%%ymm5,%%ymm5		\n\t	vsubpd	%%ymm15,%%ymm13,%%ymm13		\n\t"/* ymm4,5 = diff */\
		"vaddpd	%%ymm6,%%ymm6,%%ymm6		\n\t	vaddpd	%%ymm14,%%ymm14,%%ymm14		\n\t"\
		"vaddpd	%%ymm7,%%ymm7,%%ymm7		\n\t	vaddpd	%%ymm15,%%ymm15,%%ymm15		\n\t"\
		"vaddpd	%%ymm4,%%ymm6,%%ymm6		\n\t	vaddpd	%%ymm12,%%ymm14,%%ymm14		\n\t"/* ymm6,7 = sum */\
		"vaddpd	%%ymm5,%%ymm7,%%ymm7		\n\t	vaddpd	%%ymm13,%%ymm15,%%ymm15		\n\t"\
		"vmovaps	%%ymm4,    (%%rsi)		\n\t	vmovaps	%%ymm12,    (%%r10)		\n\t"/* spill ymm4,5 to the i0 slot - scratch reuse, clobbers input 0! */\
		"vmovaps	%%ymm5,0x20(%%rsi)		\n\t	vmovaps	%%ymm13,0x20(%%r10)		\n\t"\
		"vmovaps	%%ymm0,%%ymm4			\n\t	vmovaps	%%ymm8 ,%%ymm12			\n\t"/* copy of t4 */\
		"vmovaps	%%ymm1,%%ymm5			\n\t	vmovaps	%%ymm9 ,%%ymm13			\n\t"\
		"vsubpd	%%ymm2,%%ymm0,%%ymm0		\n\t	vsubpd	%%ymm10,%%ymm8,%%ymm8 		\n\t"/* t4-t3 */\
		"vsubpd	%%ymm3,%%ymm1,%%ymm1		\n\t	vsubpd	%%ymm11,%%ymm9,%%ymm9 		\n\t"\
		"vmulpd	0x40(%%rax),%%ymm0,%%ymm0	\n\t	vmulpd	0x40(%%rax),%%ymm8 ,%%ymm8 	\n\t"/* (t4-t3)*cc1[2] */\
		"vmulpd	0x40(%%rax),%%ymm1,%%ymm1	\n\t	vmulpd	0x40(%%rax),%%ymm9 ,%%ymm9 	\n\t"\
		"vmulpd	0x60(%%rax),%%ymm2,%%ymm2	\n\t	vmulpd	0x60(%%rax),%%ymm10,%%ymm10	\n\t"/* t3*cc1[3] */\
		"vmulpd	0x60(%%rax),%%ymm3,%%ymm3	\n\t	vmulpd	0x60(%%rax),%%ymm11,%%ymm11	\n\t"\
		"vmulpd	0x80(%%rax),%%ymm4,%%ymm4	\n\t	vmulpd	0x80(%%rax),%%ymm12,%%ymm12	\n\t"/* t4*cc1[4] */\
		"vmulpd	0x80(%%rax),%%ymm5,%%ymm5	\n\t	vmulpd	0x80(%%rax),%%ymm13,%%ymm13	\n\t"\
		"vaddpd	%%ymm0,%%ymm2,%%ymm2		\n\t	vaddpd	%%ymm8 ,%%ymm10,%%ymm10		\n\t"/* sine-term combo #1 */\
		"vaddpd	%%ymm1,%%ymm3,%%ymm3		\n\t	vaddpd	%%ymm9 ,%%ymm11,%%ymm11		\n\t"\
		"vsubpd	%%ymm4,%%ymm0,%%ymm0		\n\t	vsubpd	%%ymm12,%%ymm8 ,%%ymm8 		\n\t"/* sine-term combo #2 */\
		"vsubpd	%%ymm5,%%ymm1,%%ymm1		\n\t	vsubpd	%%ymm13,%%ymm9 ,%%ymm9 		\n\t"\
		"vmovaps	    (%%rsi),%%ymm4		\n\t	vmovaps	    (%%r10),%%ymm12		\n\t"/* reload spilled cosine-term combo from i0 scratch */\
		"vmovaps	0x20(%%rsi),%%ymm5		\n\t	vmovaps	0x20(%%r10),%%ymm13		\n\t"\
		"movq	%[__o1],%%rax				\n\t	movq	%[__u1],%%r11			\n\t"/* trig ptr done - retarget at o1/o4 pair */\
		"movq	%[__o4],%%rdx				\n\t	movq	%[__u4],%%r14			\n\t"\
		"vsubpd	%%ymm3,%%ymm6,%%ymm6		\n\t	vsubpd	%%ymm11,%%ymm14,%%ymm14		\n\t"/* o1.re = cos-term - sin-term.im */\
		"vsubpd	%%ymm2,%%ymm7,%%ymm7		\n\t	vsubpd	%%ymm10,%%ymm15,%%ymm15		\n\t"/* o4.im = cos-term.im - sin-term.re */\
		"vaddpd	%%ymm3,%%ymm3,%%ymm3		\n\t	vaddpd	%%ymm11,%%ymm11,%%ymm11		\n\t"/* double, then re-add to form the +/- pair */\
		"vaddpd	%%ymm2,%%ymm2,%%ymm2		\n\t	vaddpd	%%ymm10,%%ymm10,%%ymm10		\n\t"\
		"vmovaps	%%ymm6,    (%%rax)		\n\t	vmovaps	%%ymm14,    (%%r11)		\n\t"/* write o1.re */\
		"vmovaps	%%ymm7,0x20(%%rdx)		\n\t	vmovaps	%%ymm15,0x20(%%r14)		\n\t"/* write o4.im */\
		"vaddpd	%%ymm6,%%ymm3,%%ymm3		\n\t	vaddpd	%%ymm14,%%ymm11,%%ymm11		\n\t"\
		"vaddpd	%%ymm7,%%ymm2,%%ymm2		\n\t	vaddpd	%%ymm15,%%ymm10,%%ymm10		\n\t"\
		"vmovaps	%%ymm3,    (%%rdx)		\n\t	vmovaps	%%ymm11,    (%%r14)		\n\t"/* write o4.re */\
		"vmovaps	%%ymm2,0x20(%%rax)		\n\t	vmovaps	%%ymm10,0x20(%%r11)		\n\t"/* write o1.im */\
		"movq	%[__o2],%%rbx				\n\t	movq	%[__u2],%%r12			\n\t"/* same pattern for the o2/o3 pair: */\
		"movq	%[__o3],%%rcx				\n\t	movq	%[__u3],%%r13			\n\t"\
		"vsubpd	%%ymm1,%%ymm4,%%ymm4		\n\t	vsubpd	%%ymm9 ,%%ymm12,%%ymm12		\n\t"\
		"vsubpd	%%ymm0,%%ymm5,%%ymm5		\n\t	vsubpd	%%ymm8 ,%%ymm13,%%ymm13		\n\t"\
		"vaddpd	%%ymm1,%%ymm1,%%ymm1		\n\t	vaddpd	%%ymm9 ,%%ymm9 ,%%ymm9 		\n\t"\
		"vaddpd	%%ymm0,%%ymm0,%%ymm0		\n\t	vaddpd	%%ymm8 ,%%ymm8 ,%%ymm8 		\n\t"\
		"vmovaps	%%ymm4,    (%%rbx)		\n\t	vmovaps	%%ymm12,    (%%r12)		\n\t"/* write o2.re */\
		"vmovaps	%%ymm5,0x20(%%rcx)		\n\t	vmovaps	%%ymm13,0x20(%%r13)		\n\t"/* write o3.im */\
		"vaddpd	%%ymm4,%%ymm1,%%ymm1		\n\t	vaddpd	%%ymm12,%%ymm9,%%ymm9 		\n\t"\
		"vaddpd	%%ymm5,%%ymm0,%%ymm0		\n\t	vaddpd	%%ymm13,%%ymm8,%%ymm8 		\n\t"\
		"vmovaps	%%ymm1,    (%%rcx)		\n\t	vmovaps	%%ymm9 ,    (%%r13)		\n\t"/* write o3.re */\
		"vmovaps	%%ymm0,0x20(%%rbx)		\n\t	vmovaps	%%ymm8 ,0x20(%%r12)		\n\t"/* write o2.im */\
		:					/* outputs: none */\
		: [__cc1] "m" (Xcc1)	/* All inputs from memory addresses here */\
		 ,[__two] "m" (Xtwo)\
		 ,[__i0] "m" (Xi0)\
		 ,[__i1] "m" (Xi1)\
		 ,[__i2] "m" (Xi2)\
		 ,[__i3] "m" (Xi3)\
		 ,[__i4] "m" (Xi4)\
		 ,[__o0] "m" (Xo0)\
		 ,[__o1] "m" (Xo1)\
		 ,[__o2] "m" (Xo2)\
		 ,[__o3] "m" (Xo3)\
		 ,[__o4] "m" (Xo4)\
		 ,[__j0] "m" (Xj0)\
		 ,[__j1] "m" (Xj1)\
		 ,[__j2] "m" (Xj2)\
		 ,[__j3] "m" (Xj3)\
		 ,[__j4] "m" (Xj4)\
		 ,[__u0] "m" (Xu0)\
		 ,[__u1] "m" (Xu1)\
		 ,[__u2] "m" (Xu2)\
		 ,[__u3] "m" (Xu3)\
		 ,[__u4] "m" (Xu4)\
		: "cc","memory","rax","rbx","rcx","rdx","rdi","rsi","r10","r11","r12","r13","r14","r15","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15"		/* Clobbered registers */\
	);\
	}

	// AVX version has shufpd immediate = 5 = 0101_2, which is the doubled analog of the SSE2 imm8 = 1 = 01_2:
	/* Pair-square step of the real-data FFT wrapper. Operates in-place on 4 complex vector
	operands at tAr,tBr,tCr,tDr (re/im parts in adjacent 32-byte AVX slots), pairing tA<->tD and
	tB<->tC, using sincos vector data at [c],[s] and the constant 0.25 at [forth].
	Notation: ~x = the d0,d1-swapped (vshufpd $5) copy of x. Outline:
	  1. build cross terms 2*tA*conj(~tD) (-> ymm0,1) and 2*tB*conj(~tC) (-> ymm2,3);
	  2. square each of tA,tB,tC,tD in-place and subtract the (possibly swapped) squares
	     from the cross terms to form correction vectors;
	  3. scale the corrections by 0.25*((1+c) + I*s) resp. 0.25*((s-1) - I*c);
	  4. add them into tA,tB and (d0,1-swapped, with conjugate signs) into tD,tC.
	All four data pointers must reference 32-byte-aligned data (vmovaps).
	Clobbers rax,rbx,rcx,rdx and ymm0-7. */
	#define PAIR_SQUARE_4_SSE2(XtAr, XtBr, XtCr, XtDr, Xc, Xs, Xforth)\
	{\
	__asm__ volatile (\
		"movq	%[__tDr]	,%%rdx		\n\t"\
		"movq	%[__tAr]	,%%rax		\n\t"\
		"\n\t"/* ymm0,1 = 2*tA*conj(~tD): */\
		"vmovaps	    (%%rdx)	,%%ymm6		\n\t"\
		"vmovaps	0x20(%%rdx)	,%%ymm7		\n\t"\
		"vmovaps	    (%%rax)	,%%ymm0		\n\t"\
		"vmovaps	0x20(%%rax)	,%%ymm3		\n\t"\
		"vshufpd	$5,%%ymm6,%%ymm6,%%ymm6		\n\t"/* ~tDr */\
		"vshufpd	$5,%%ymm7,%%ymm7,%%ymm7		\n\t"/* ~tDi */\
		"vmovaps	    (%%rax)	,%%ymm2		\n\t"\
		"vmovaps	0x20(%%rax)	,%%ymm1		\n\t"\
		"\n\t"\
		"vmulpd	%%ymm6,%%ymm0,%%ymm0		\n\t"/* tAr*~tDr */\
		"vmulpd	%%ymm7,%%ymm3,%%ymm3		\n\t"/* tAi*~tDi */\
		"vmulpd	%%ymm6,%%ymm1,%%ymm1		\n\t"/* tAi*~tDr */\
		"vmulpd	%%ymm7,%%ymm2,%%ymm2		\n\t"/* tAr*~tDi */\
		"vaddpd	%%ymm3,%%ymm0,%%ymm0		\n\t"/* re = tAr*~tDr + tAi*~tDi */\
		"vsubpd	%%ymm2,%%ymm1,%%ymm1		\n\t"/* im = tAi*~tDr - tAr*~tDi */\
		"vaddpd	%%ymm0,%%ymm0,%%ymm0		\n\t"/* double both halves */\
		"vaddpd	%%ymm1,%%ymm1,%%ymm1		\n\t"\
		"\n\t"\
		"movq	%[__tCr]	,%%rcx		\n\t"\
		"movq	%[__tBr]	,%%rbx		\n\t"\
		"\n\t"/* ymm2,3 = 2*tB*conj(~tC), same pattern: */\
		"vmovaps	    (%%rcx)	,%%ymm6		\n\t"\
		"vmovaps	0x20(%%rcx)	,%%ymm7		\n\t"\
		"vmovaps	    (%%rbx)	,%%ymm2		\n\t"\
		"vmovaps	0x20(%%rbx)	,%%ymm5		\n\t"\
		"vshufpd	$5,%%ymm6,%%ymm6,%%ymm6		\n\t"/* ~tCr */\
		"vshufpd	$5,%%ymm7,%%ymm7,%%ymm7		\n\t"/* ~tCi */\
		"vmovaps	    (%%rbx)	,%%ymm4		\n\t"\
		"vmovaps	0x20(%%rbx)	,%%ymm3		\n\t"\
		"\n\t"\
		"vmulpd	%%ymm6,%%ymm2,%%ymm2		\n\t"/* tBr*~tCr */\
		"vmulpd	%%ymm7,%%ymm5,%%ymm5		\n\t"/* tBi*~tCi */\
		"vmulpd	%%ymm6,%%ymm3,%%ymm3		\n\t"/* tBi*~tCr */\
		"vmulpd	%%ymm7,%%ymm4,%%ymm4		\n\t"/* tBr*~tCi */\
		"vaddpd	%%ymm5,%%ymm2,%%ymm2		\n\t"\
		"vsubpd	%%ymm4,%%ymm3,%%ymm3		\n\t"\
		"vaddpd	%%ymm2,%%ymm2,%%ymm2		\n\t"\
		"vaddpd	%%ymm3,%%ymm3,%%ymm3		\n\t"\
		"\n\t"/* square tA in-place: re = (r-i)*(r+i), im = 2*r*i */\
		"vmovaps	    (%%rax)	,%%ymm4		\n\t"\
		"vmovaps	0x20(%%rax)	,%%ymm5		\n\t"\
		"vsubpd	%%ymm5,%%ymm4,%%ymm4		\n\t"/* r-i */\
		"vaddpd	%%ymm5,%%ymm5,%%ymm5		\n\t"\
		"vaddpd	%%ymm4,%%ymm5,%%ymm5		\n\t"/* r+i */\
		"vmulpd	%%ymm5,%%ymm4,%%ymm4		\n\t"/* r^2-i^2 */\
		"\n\t"\
		"vmovaps	    (%%rax)	,%%ymm5		\n\t"\
		"vmulpd	0x20(%%rax)	,%%ymm5,%%ymm5		\n\t"\
		"vaddpd	%%ymm5		,%%ymm5,%%ymm5		\n\t"/* 2*r*i */\
		"vmovaps	%%ymm4	,    (%%rax)	\n\t"/* tA = tA^2 */\
		"vmovaps	%%ymm5	,0x20(%%rax)	\n\t"\
		"\n\t"\
		"vsubpd	%%ymm4,%%ymm0,%%ymm0		\n\t"/* subtract tA^2 from cross term #1 */\
		"vsubpd	%%ymm5,%%ymm1,%%ymm1		\n\t"\
		"\n\t"/* square tB in-place, subtract from cross term #2: */\
		"vmovaps	    (%%rbx)	,%%ymm6		\n\t"\
		"vmovaps	0x20(%%rbx)	,%%ymm7		\n\t"\
		"vsubpd	%%ymm7,%%ymm6,%%ymm6		\n\t"\
		"vaddpd	%%ymm7,%%ymm7,%%ymm7		\n\t"\
		"vaddpd	%%ymm6,%%ymm7,%%ymm7		\n\t"\
		"vmulpd	%%ymm7,%%ymm6,%%ymm6		\n\t"\
		"\n\t"\
		"vmovaps	    (%%rbx)	,%%ymm7		\n\t"\
		"vmulpd	0x20(%%rbx)	,%%ymm7,%%ymm7		\n\t"\
		"vaddpd	%%ymm7		,%%ymm7,%%ymm7		\n\t"\
		"vmovaps	%%ymm6	,    (%%rbx)	\n\t"/* tB = tB^2 */\
		"vmovaps	%%ymm7	,0x20(%%rbx)	\n\t"\
		"\n\t"\
		"vsubpd	%%ymm6,%%ymm2,%%ymm2		\n\t"\
		"vsubpd	%%ymm7,%%ymm3,%%ymm3		\n\t"\
		"\n\t"/* square tD in-place; fold its d0,1-swapped square into term #1: */\
		"vmovaps	    (%%rdx)	,%%ymm4		\n\t"\
		"vmovaps	0x20(%%rdx)	,%%ymm5		\n\t"\
		"vsubpd	%%ymm5,%%ymm4,%%ymm4		\n\t"\
		"vaddpd	%%ymm5,%%ymm5,%%ymm5		\n\t"\
		"vaddpd	%%ymm4,%%ymm5,%%ymm5		\n\t"\
		"vmulpd	%%ymm5,%%ymm4,%%ymm4		\n\t"\
		"\n\t"\
		"vmovaps	    (%%rdx)	,%%ymm5		\n\t"\
		"vmulpd	0x20(%%rdx)	,%%ymm5,%%ymm5		\n\t"\
		"vaddpd	%%ymm5		,%%ymm5,%%ymm5		\n\t"\
		"vmovaps	%%ymm4	,    (%%rdx)	\n\t"/* tD = tD^2 */\
		"vmovaps	%%ymm5	,0x20(%%rdx)	\n\t"\
		"vshufpd	$5,%%ymm4,%%ymm4,%%ymm4		\n\t"/* swap halves of tD^2 before folding in */\
		"vshufpd	$5,%%ymm5,%%ymm5,%%ymm5		\n\t"\
		"\n\t"\
		"vsubpd	%%ymm4,%%ymm0,%%ymm0		\n\t"\
		"vaddpd	%%ymm5,%%ymm1,%%ymm1		\n\t"\
		"\n\t"/* square tC in-place; fold its d0,1-swapped square into term #2: */\
		"vmovaps	    (%%rcx)	,%%ymm6		\n\t"\
		"vmovaps	0x20(%%rcx)	,%%ymm7		\n\t"\
		"vsubpd	%%ymm7,%%ymm6,%%ymm6		\n\t"\
		"vaddpd	%%ymm7,%%ymm7,%%ymm7		\n\t"\
		"vaddpd	%%ymm6,%%ymm7,%%ymm7		\n\t"\
		"vmulpd	%%ymm7,%%ymm6,%%ymm6		\n\t"\
		"\n\t"\
		"vmovaps	    (%%rcx)	,%%ymm7		\n\t"\
		"vmulpd	0x20(%%rcx)	,%%ymm7,%%ymm7		\n\t"\
		"vaddpd	%%ymm7		,%%ymm7,%%ymm7		\n\t"\
		"vmovaps	%%ymm6	,    (%%rcx)	\n\t"/* tC = tC^2 */\
		"vmovaps	%%ymm7	,0x20(%%rcx)	\n\t"\
		"vshufpd	$5,%%ymm6,%%ymm6,%%ymm6		\n\t"\
		"vshufpd	$5,%%ymm7,%%ymm7,%%ymm7		\n\t"\
		"\n\t"\
		"vsubpd	%%ymm6,%%ymm2,%%ymm2		\n\t"\
		"vaddpd	%%ymm7,%%ymm3,%%ymm3		\n\t"\
		"\n\t"/* scale correction #1: (ymm0,1) *= 0.25*((1+c) + I*s): */\
		"/*** Can do in || with above segment ***/\n\t"\
		"movq	%[__c]		,%%rax		\n\t"\
		"movq	%[__s]		,%%rbx		\n\t"\
		"movq	%[__forth]	,%%rdx		\n\t"\
		"vmovaps	%%ymm0		,%%ymm4		\n\t"\
		"vmovaps	%%ymm1		,%%ymm5		\n\t"\
		"vmulpd	(%%rax)	,%%ymm0,%%ymm0		\n\t"/* *c */\
		"vmulpd	(%%rax)	,%%ymm1,%%ymm1		\n\t"\
		"vaddpd	%%ymm4	,%%ymm0,%%ymm0		\n\t"/* now *(1+c) */\
		"vaddpd	%%ymm5	,%%ymm1,%%ymm1		\n\t"\
		"vmulpd	(%%rbx)	,%%ymm4,%%ymm4		\n\t"/* copies *s */\
		"vmulpd	(%%rbx)	,%%ymm5,%%ymm5		\n\t"\
		"vsubpd	%%ymm5	,%%ymm0,%%ymm0		\n\t"/* cross-combine re/im */\
		"vaddpd	%%ymm4	,%%ymm1,%%ymm1		\n\t"\
		"vmulpd	(%%rdx)	,%%ymm0,%%ymm0		\n\t"/* *0.25 */\
		"vmulpd	(%%rdx)	,%%ymm1,%%ymm1		\n\t"\
		"\n\t"/* scale correction #2: (ymm2,3) *= 0.25*((s-1) - I*c): */\
		"/*** Can do in || with above segment ***/\n\t"\
		"vmovaps	%%ymm2	,%%ymm6		\n\t"\
		"vmovaps	%%ymm3	,%%ymm7		\n\t"\
		"vmulpd	(%%rbx)	,%%ymm2,%%ymm2		\n\t"/* *s */\
		"vmulpd	(%%rbx)	,%%ymm3,%%ymm3		\n\t"\
		"vsubpd	%%ymm6	,%%ymm2,%%ymm2		\n\t"/* now *(s-1) */\
		"vsubpd	%%ymm7	,%%ymm3,%%ymm3		\n\t"\
		"vmulpd	(%%rax)	,%%ymm6,%%ymm6		\n\t"/* copies *c */\
		"vmulpd	(%%rax)	,%%ymm7,%%ymm7		\n\t"\
		"vaddpd	%%ymm7	,%%ymm2,%%ymm2		\n\t"\
		"vsubpd	%%ymm6	,%%ymm3,%%ymm3		\n\t"\
		"vmulpd	(%%rdx)	,%%ymm2,%%ymm2		\n\t"/* *0.25 */\
		"vmulpd	(%%rdx)	,%%ymm3,%%ymm3		\n\t"\
		"\n\t"/* apply corrections: tA += (ymm0,1), tB -= (ymm2,3): */\
		"movq	%[__tAr]	,%%rax		\n\t"\
		"movq	%[__tBr]	,%%rbx		\n\t"\
		"\n\t"\
		"vmovaps	    (%%rax)	,%%ymm4		\n\t"\
		"vmovaps	0x20(%%rax)	,%%ymm5		\n\t"\
		"vmovaps	    (%%rbx)	,%%ymm6		\n\t"\
		"vmovaps	0x20(%%rbx)	,%%ymm7		\n\t"\
		"vaddpd	%%ymm0,%%ymm4,%%ymm4		\n\t"\
		"vaddpd	%%ymm1,%%ymm5,%%ymm5		\n\t"\
		"vsubpd	%%ymm2,%%ymm6,%%ymm6		\n\t"\
		"vsubpd	%%ymm3,%%ymm7,%%ymm7		\n\t"\
		"vmovaps	%%ymm4	,    (%%rax)	\n\t"\
		"vmovaps	%%ymm5	,0x20(%%rax)	\n\t"\
		"vmovaps	%%ymm6	,    (%%rbx)	\n\t"\
		"vmovaps	%%ymm7	,0x20(%%rbx)	\n\t"\
		"\n\t"/* and fold the d0,1-swapped corrections into tD,tC with conjugate signs: */\
		"movq	%[__tCr]	,%%rcx		\n\t"\
		"movq	%[__tDr]	,%%rdx		\n\t"\
		"\n\t"\
		"vshufpd	$5,%%ymm0,%%ymm0,%%ymm0		\n\t"\
		"vshufpd	$5,%%ymm1,%%ymm1,%%ymm1		\n\t"\
		"vshufpd	$5,%%ymm2,%%ymm2,%%ymm2		\n\t"\
		"vshufpd	$5,%%ymm3,%%ymm3,%%ymm3		\n\t"\
		"\n\t"\
		"vmovaps	    (%%rdx)	,%%ymm4		\n\t"\
		"vmovaps	0x20(%%rdx)	,%%ymm5		\n\t"\
		"vmovaps	    (%%rcx)	,%%ymm6		\n\t"\
		"vmovaps	0x20(%%rcx)	,%%ymm7		\n\t"\
		"vaddpd	%%ymm0,%%ymm4,%%ymm4		\n\t"/* tD.re += ~corr1.re */\
		"vsubpd	%%ymm1,%%ymm5,%%ymm5		\n\t"/* tD.im -= ~corr1.im */\
		"vsubpd	%%ymm2,%%ymm6,%%ymm6		\n\t"/* tC.re -= ~corr2.re */\
		"vaddpd	%%ymm3,%%ymm7,%%ymm7		\n\t"/* tC.im += ~corr2.im */\
		"vmovaps	%%ymm4	,    (%%rdx)	\n\t"\
		"vmovaps	%%ymm5	,0x20(%%rdx)	\n\t"\
		"vmovaps	%%ymm6	,    (%%rcx)	\n\t"\
		"vmovaps	%%ymm7	,0x20(%%rcx)	\n\t"\
		:					/* outputs: none */\
		: [__tAr] "m" (XtAr)	/* All inputs from memory addresses here */\
		 ,[__tBr] "m" (XtBr)\
		 ,[__tCr] "m" (XtCr)\
		 ,[__tDr] "m" (XtDr)\
		 ,[__c] "m" (Xc)\
		 ,[__s] "m" (Xs)\
		 ,[__forth] "m" (Xforth)\
		: "cc","memory","rax","rbx","rcx","rdx","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7"		/* Clobbered registers */\
	);\
	}

	// Sep 2019: 2-input FFT(a)*FFT(b) version of above PAIR_SQUARE_4_SSE2 macro, based on SSE2 version of PAIR_MUL_4_SSE2.
	// NOTE: Unlike the PAIR_SQUARE_4 version of this macro, the MUL version assumes the sincos terms premultiplied by 1/4!
	// AVX version has shufpd immediate = 5 = 0101_2, which is the doubled analog of the SSE2 imm8 = 1 = 01_2:
	#define PAIR_MUL_4_SSE2(XA0,XA1,XA2,XA3, XB0,XB1,XB2,XB3, Xc,Xs,Xforth)\
	{\
	__asm__ volatile (\
		/* Load a2,a3 and b2,b3, d0,d1-swap, then compute
			t0 = ~a3r*~b3r - ~a3i*~b3i, t2 = ~a3r*~b3i + ~a3i*~b3r
			t1 = ~a2r*~b2r - ~a2i*~b2i, t3 = ~a2r*~b2i + ~a2i*~b2r
		*/\
		"movq	%[__A2]	,%%rcx	\n\t"\
		"movq	%[__A3]	,%%rdx	\n\t"\
		"movq	%[__B2]	,%%rdi	\n\t"\
		"movq	%[__B3]	,%%rsi	\n\t"\
		/* Must load double-pairs-to-be-swapped into regs first, since SHUFPD takes low double from DEST and high from SRC: */\
		"vmovaps	    (%%rcx),%%ymm0		\n\t	vshufpd	$5,%%ymm0,%%ymm0,%%ymm0	\n\t"/* ~a2r */\
		"vmovaps	0x20(%%rcx),%%ymm1		\n\t	vshufpd	$5,%%ymm1,%%ymm1,%%ymm1	\n\t"/* ~a2i */\
		"vmovaps	    (%%rdi),%%ymm4		\n\t	vshufpd	$5,%%ymm4,%%ymm4,%%ymm4	\n\t"/* ~b2r */\
		"vmovaps	0x20(%%rdi),%%ymm5		\n\t	vshufpd	$5,%%ymm5,%%ymm5,%%ymm5	\n\t"/* ~b2i */\
		"vmovaps	%%ymm0	,%%ymm8			\n\t	vmulpd	%%ymm4	,%%ymm8	,%%ymm8	\n\t"/* ~a2r*~b2r */\
		"vmovaps	%%ymm1	,%%ymm10		\n\t	vmulpd	%%ymm5	,%%ymm10,%%ymm10\n\t"/* ~a2i*~b2i */\
		"vmovaps	%%ymm5	,%%ymm11		\n\t	vmulpd	%%ymm0	,%%ymm11,%%ymm11\n\t"/* ~a2r*~b2i */\
		"vmovaps	%%ymm4	,%%ymm9			\n\t	vmulpd	%%ymm1	,%%ymm9	,%%ymm9	\n\t"/* ~a2i*~b2r */\
		"vsubpd		%%ymm10	,%%ymm8	,%%ymm8	\n\t	vaddpd	%%ymm11	,%%ymm9	,%%ymm9	\n\t"/* t1,t3 */\
		"vmovaps	    (%%rdx),%%ymm2		\n\t	vshufpd	$5,%%ymm2,%%ymm2,%%ymm2	\n\t"/* ~a3r */\
		"vmovaps	0x20(%%rdx),%%ymm3		\n\t	vshufpd	$5,%%ymm3,%%ymm3,%%ymm3	\n\t"/* ~a3i */\
		"vmovaps	    (%%rsi),%%ymm6		\n\t	vshufpd	$5,%%ymm6,%%ymm6,%%ymm6	\n\t"/* ~b3r */\
		"vmovaps	0x20(%%rsi),%%ymm7		\n\t	vshufpd	$5,%%ymm7,%%ymm7,%%ymm7	\n\t"/* ~b3i */\
		/* t1,3 not needed until final butterfly sequence, so write back to A2,3 memlocs: */\
		"vmovaps	%%ymm8	,    (%%rcx)	\n\t	movq	%[__A0]	,%%rax	\n\t"\
		"vmovaps	%%ymm9	,0x20(%%rcx)	\n\t	movq	%[__A1]	,%%rbx	\n\t"\
		"vmovaps	%%ymm2	,%%ymm8			\n\t	vmulpd	%%ymm6	,%%ymm8	,%%ymm8	\n\t"/* ~a3r*~b3r */\
		"vmovaps	%%ymm3	,%%ymm10		\n\t	vmulpd	%%ymm7	,%%ymm10,%%ymm10\n\t"/* ~a3i*~b3i */\
		"vmovaps	%%ymm7	,%%ymm11		\n\t	vmulpd	%%ymm2	,%%ymm11,%%ymm11\n\t"/* ~a3r*~b3i */\
		"vmovaps	%%ymm6	,%%ymm9			\n\t	vmulpd	%%ymm3	,%%ymm9	,%%ymm9	\n\t"/* ~a3i*~b3r */\
		"vsubpd		%%ymm10	,%%ymm8	,%%ymm8	\n\t	vaddpd	%%ymm11	,%%ymm9	,%%ymm9	\n\t"/* t0,t2 */\
		/* t0,2 not needed until final butterfly sequence, so write back to A2,3 memlocs: */\
		"vmovaps	%%ymm8	,    (%%rdx)	\n\t	movq	%[__B0]	,%%rdi	\n\t"\
		"vmovaps	%%ymm9	,0x20(%%rdx)	\n\t	movq	%[__B1]	,%%rsi	\n\t"\
	/* a2,3 in ymm0-3, b2,3 in ymm4-7, t1,3 in (rcx), t0,2 in (rdx) */\
		/* calculate difference terms...these need the [a,b][2|3] vector-data to be d0,1-swapped:
			~a3r -= a0r, ~a3i += a0i,
			~a2r -= a1r, ~a2i += a1i, similar for b-data, but move ~b2 -+ b1 down to just before a1*b1 cmul to free up 2 regs.
		*/\
/*** Need ~a3r = a0r - ~a3r, not ~a3r -= a0r! [Similar for a2r,b3r,b2r] ***
************** As currently, a2r,a3r,b2r,b3r all negated! ****************/\
		"vmovaps	    (%%rax)	,%%ymm8		\n\t	vsubpd	%%ymm8	,%%ymm2	,%%ymm2	\n\t"/* ~a3r -= a0r */\
		"vmovaps	0x20(%%rax)	,%%ymm9		\n\t	vaddpd	%%ymm9	,%%ymm3	,%%ymm3	\n\t"/* ~a3i += a0i */\
		"vmovaps	    (%%rbx)	,%%ymm10	\n\t	vsubpd	%%ymm10	,%%ymm0	,%%ymm0	\n\t"/* ~a2r -= a1r */\
		"vmovaps	0x20(%%rbx)	,%%ymm11	\n\t	vaddpd	%%ymm11	,%%ymm1	,%%ymm1	\n\t"/* ~a2i += a1i */\
		"vmovaps	    (%%rdi)	,%%ymm14	\n\t	vsubpd	%%ymm14	,%%ymm6	,%%ymm6	\n\t"/* ~b3r -= b0r */\
		"vmovaps	0x20(%%rdi)	,%%ymm15	\n\t	vaddpd	%%ymm15	,%%ymm7	,%%ymm7	\n\t"/* ~b3i += b0i */\
		/* now calculate 1st square-like term and store back in H(j) slot:
			t4 = a0r*b0r - a0i*b0i, a0i = a0r*b0i + a0i*b0r, a0r = t4
			t5 = a1r*b1r - a1i*b1i, a1i = a1r*b1i + a1i*b1r, a1r = t5
		*/\
		"vmovaps	%%ymm8	,%%ymm12		\n\t	vmulpd	%%ymm14	,%%ymm8	,%%ymm8	\n\t"/* a0r*b0r */\
		"vmovaps	%%ymm9	,%%ymm13		\n\t	vmulpd	%%ymm15	,%%ymm13,%%ymm13\n\t"/* a0i*b0i */\
		"											vmulpd	%%ymm15	,%%ymm12,%%ymm12\n\t"/* a0r*b0i */\
		"											vmulpd	%%ymm14	,%%ymm9	,%%ymm9	\n\t"/* a0i*b0r */\
		"vsubpd	%%ymm13	,%%ymm8	,%%ymm8		\n\t	vaddpd	%%ymm12	,%%ymm9	,%%ymm9	\n\t"	/* a0r,i in ymm8,9 */\
	/*** Consider overlapping these 2 cmul to better hide latency ***/\
		"vmovaps	    (%%rsi)	,%%ymm14	\n\t	vsubpd	%%ymm14	,%%ymm4	,%%ymm4	\n\t"/* ~b2r -= b1r */\
		"vmovaps	0x20(%%rsi)	,%%ymm15	\n\t	vaddpd	%%ymm15	,%%ymm5	,%%ymm5	\n\t"/* ~b2i += b1i */\
		"vmovaps	%%ymm10	,%%ymm12		\n\t	vmulpd	%%ymm14	,%%ymm10,%%ymm10\n\t"/* a1r*b1r */\
		"vmovaps	%%ymm11	,%%ymm13		\n\t	vmulpd	%%ymm15	,%%ymm13,%%ymm13\n\t"/* a1i*b1i */\
		"											vmulpd	%%ymm15	,%%ymm12,%%ymm12\n\t"/* a1r*b1i */\
		"											vmulpd	%%ymm14	,%%ymm11,%%ymm11\n\t"/* a1i*b1r */\
		"vsubpd	%%ymm13	,%%ymm10,%%ymm10	\n\t	vaddpd	%%ymm12	,%%ymm11,%%ymm11\n\t"	/* a1r,i in ymm10,11 */\
	/* a0,1 in ymm8-11, a2,3 in ymm0-3, b2,3 in ymm4-7, t1,3 in (rcx), t0,2 in (rdx) */\
		/* calculate the complex products to build the second term:
			t4 = ~a3r*~b3r - ~a3i*~b3i, ~a3i = ~a3r*~b3i + ~a3i*~b3r, ~a3r,i in ymm2,3, ~b3r,i in ymm6,7
			t5 = ~a2r*~b2r - ~a2i*~b2i, ~a2i = ~a2r*~b2i + ~a2i*~b2r, ~arr,i in ymm0,1, ~b2r,i in ymm4,5
		*/\
/****************** a2r,a3r,b2r,b3r being negated means a2i,a3i come out negated ****************/\
		"vmovaps	%%ymm2	,%%ymm12		\n\t	vmulpd	%%ymm6	,%%ymm2	,%%ymm2	\n\t"/* ~a3r*~b3r */\
		"vmovaps	%%ymm3	,%%ymm13		\n\t	vmulpd	%%ymm7	,%%ymm13,%%ymm13\n\t"/* ~a3i*~b3i */\
		"											vmulpd	%%ymm7	,%%ymm12,%%ymm12\n\t"/* ~a3r*~b3i */\
		"											vmulpd	%%ymm6	,%%ymm3	,%%ymm3	\n\t"/* ~a3i*~b3r */\
		"vsubpd	%%ymm13	,%%ymm2	,%%ymm2		\n\t	vaddpd	%%ymm12	,%%ymm3	,%%ymm3	\n\t"	/* t4,~a3i in ymm2,3 */\
	/*** Consider overlapping these 2 cmul to better hide latency ***/\
		"vmovaps	%%ymm0	,%%ymm14		\n\t	vmulpd	%%ymm4	,%%ymm0	,%%ymm0	\n\t"/* ~a2r*~b2r */\
		"vmovaps	%%ymm1	,%%ymm15		\n\t	vmulpd	%%ymm5	,%%ymm15,%%ymm15\n\t"/* ~a2i*~b2i */\
		"											vmulpd	%%ymm5	,%%ymm14,%%ymm14\n\t"/* ~a2r*~b2i */\
		"											vmulpd	%%ymm4	,%%ymm1	,%%ymm1	\n\t"/* ~a2i*~b2r */\
		"vsubpd	%%ymm15	,%%ymm0	,%%ymm0		\n\t	vaddpd	%%ymm14	,%%ymm1	,%%ymm1	\n\t"	/* t5,~a2i in ymm0,1 */\
		/* ymm4-7,12-15 free */\
		/* Assume [c0,s1],[s0,c1] sincos vector-data are in the [c] and [s]-input-pointers, then compute
			~a3r = [cc+0.25]*t4 - [ss]*~a3i, ~a3i = [ss]*t4 + [cc+0.25]*~a3i
			~a2r = [0.25-ss]*t5 - [cc]*~a2i, ~a2i = [cc]*t5 + [0.25-ss]*~a2i ,
		where cc = 0.25*[c0,s1] and ss = 0.25*[s0,c1]:
		*/\
/****************** a2i,a3i being negated requires +- sign swap in this next computation ****************/\
		"movq	%[__forth],%%rdi		\n\t	vmovaps	(%%rdi),%%ymm6		\n\t	vmovaps	%%ymm6,%%ymm7	\n\t"/* 2 copies of 0.25 */\
		"movq	%[__c]	,%%rdi			\n\t	vmovaps	(%%rdi),%%ymm4		\n\t"/*	cc assumed premultiplied by 0.25 */\
		"movq	%[__s]	,%%rsi			\n\t	vmovaps	(%%rsi),%%ymm5		\n\t"/*	ss assumed premultiplied by 0.25 */\
		"vaddpd	%%ymm4	,%%ymm6	,%%ymm6	\n\t	vsubpd	%%ymm5	,%%ymm7	,%%ymm7	\n\t"	/* [cc+0.25],[0.25-ss] in ymm6,7 */\
		"vmovaps	%%ymm2	,%%ymm12		\n\t	vmulpd	%%ymm6	,%%ymm2	,%%ymm2	\n\t"/*   t4*[cc+0.25] */\
		"vmovaps	%%ymm3	,%%ymm13		\n\t	vmulpd	%%ymm5	,%%ymm13,%%ymm13\n\t"/* ~a3i*[ss] */\
		"											vmulpd	%%ymm5	,%%ymm12,%%ymm12\n\t"/*   t4*[ss] */\
		"											vmulpd	%%ymm6	,%%ymm3	,%%ymm3	\n\t"/* ~a3i*[cc+0.25] */\
		"vaddpd	%%ymm13	,%%ymm2	,%%ymm2		\n\t	vsubpd	%%ymm12	,%%ymm3	,%%ymm3	\n\t"	/* ~a3r = [cc+0.25]*t4 - [ss]*~a3i, ~a3i = [cc+0.25]*~a3i - [ss]*t4 in ymm2,3 */\
	/*** Consider overlapping these 2 cmul to better hide latency ***/\
		"vmovaps	%%ymm0	,%%ymm14		\n\t	vmulpd	%%ymm7	,%%ymm0	,%%ymm0	\n\t"/*   t5*[0.25-ss] */\
		"vmovaps	%%ymm1	,%%ymm15		\n\t	vmulpd	%%ymm4	,%%ymm15,%%ymm15\n\t"/* ~a2i*[cc] */\
		"											vmulpd	%%ymm4	,%%ymm14,%%ymm14\n\t"/*   t5*[cc] */\
		"											vmulpd	%%ymm7	,%%ymm1	,%%ymm1	\n\t"/* ~a2i*[0.25-ss] */\
		"vaddpd	%%ymm15	,%%ymm0	,%%ymm0		\n\t	vsubpd	%%ymm14	,%%ymm1	,%%ymm1	\n\t"	/* ~a2r = [0.25-ss]*t5 - [cc]*~a2i, ~a2i = [0.25-ss]*~a2i - [cc]*t5 in ymm0,1 */\
/****************** a2i,a3i negated ****************/\
	/* a0,1 in ymm8-11, a2,3 in ymm0-3, t1,3 in (rcx), t0,2 in (rdx) */\
		"vmovaps	    (%%rdx)	,%%ymm4		\n\t"/* t0 */\
		"vmovaps	0x20(%%rdx)	,%%ymm5		\n\t"/* t2 */\
		"vmovaps	    (%%rcx)	,%%ymm6		\n\t"/* t1 */\
		"vmovaps	0x20(%%rcx)	,%%ymm7		\n\t"/* t3 */\
	/* and now complete and store the results:
		a0r -= ~a3r, a0i -= ~a3i
		a1r -= ~a2r, a1i -= ~a2i
	N-j terms:
		~a3r = t0 - ~a3r, ~a3i += t2
		~a2r = t1 - ~a2r, ~a2i += t3
	*/\
/****************** a2i,a3i negated means in rcol instead computing a0,1i += ~a3,2i, a3,2i = t2,3 - a3,2i ****************/\
		"vsubpd	%%ymm2	,%%ymm8	,%%ymm8	\n\t	vaddpd	%%ymm3	,%%ymm9	,%%ymm9	\n\t"	/* a0r,i in ymm8,9 */\
		"vsubpd	%%ymm0	,%%ymm10,%%ymm10\n\t	vaddpd	%%ymm1	,%%ymm11,%%ymm11\n\t"	/* a1r,i in ymm10,11 */\
		"vsubpd	%%ymm2	,%%ymm4	,%%ymm4	\n\t	vsubpd	%%ymm3	,%%ymm5	,%%ymm5	\n\t"	/* ~a3r,i in ymm4,5 */\
		"vsubpd	%%ymm0	,%%ymm6	,%%ymm6	\n\t	vsubpd	%%ymm1	,%%ymm7	,%%ymm7	\n\t"	/* ~a2r,i in ymm6,7 */\
	/* Interleave writes of a0,a1 with un-shufflings of ~a2,~a3: */\
		"vmovaps	%%ymm8	,    (%%rax)	\n\t	vshufpd	$5	,%%ymm4	,%%ymm4,%%ymm4	\n\t"/* ~a3r */\
		"vmovaps	%%ymm9	,0x20(%%rax)	\n\t	vshufpd	$5	,%%ymm5	,%%ymm5,%%ymm5	\n\t"/* ~a3i */\
		"vmovaps	%%ymm10	,    (%%rbx)	\n\t	vshufpd	$5	,%%ymm6	,%%ymm6,%%ymm6	\n\t"/* ~a2r */\
		"vmovaps	%%ymm11	,0x20(%%rbx)	\n\t	vshufpd	$5	,%%ymm7	,%%ymm7,%%ymm7	\n\t"/* ~a2i */\
		"vmovaps	%%ymm4	,    (%%rdx)	\n\t"\
		"vmovaps	%%ymm5	,0x20(%%rdx)	\n\t"\
		"vmovaps	%%ymm6	,    (%%rcx)	\n\t"\
		"vmovaps	%%ymm7	,0x20(%%rcx)	\n\t"\
		/* Cost: [35 vector-load/store (0 implicit), 12 shufpd, 34 addpd, 32 mulpd, 21 vector-register-copy] */\
		:					/* outputs: none */\
		: [__A0] "m" (XA0)	/* All inputs from memory addresses here */\
		 ,[__A1] "m" (XA1)\
		 ,[__A2] "m" (XA2)\
		 ,[__A3] "m" (XA3)\
		 ,[__B0] "m" (XB0)\
		 ,[__B1] "m" (XB1)\
		 ,[__B2] "m" (XB2)\
		 ,[__B3] "m" (XB3)\
		 ,[__c] "m" (Xc)\
		 ,[__s] "m" (Xs)\
		 ,[__forth] "m" (Xforth)\
		: "cc","memory","rax","rbx","rcx","rdx","rdi","rsi","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15"	/* Clobbered registers */\
	);\
	}

	/*...Radix-7 DFT: Inputs in memlocs __i0-6, outputs into __o0-6, possibly coincident with inputs. */
	// AVX version: [54 ADD, 34 SUB, 16 MUL, 54 memref]
	//
	#define SSE2_RADIX_07_DFT(Xi0,Xi1,Xi2,Xi3,Xi4,Xi5,Xi6, Xcc, Xo0,Xo1,Xo2,Xo3,Xo4,Xo5,Xo6)\
	{\
	__asm__ volatile (\
		"movq	%[__i1],%%rax		\n\t"\
		"movq	%[__i2],%%rbx		\n\t"\
		"movq	%[__i3],%%rcx		\n\t"\
		"movq	%[__i4],%%rdx		\n\t"\
		"movq	%[__i5],%%rsi		\n\t"\
		"movq	%[__i6],%%rdi		\n\t	/*** Imaginary Parts: ***/	\n\t"\
		"vmovaps	(%%rax),%%ymm6		\n\t	vmovaps	0x20(%%rax),%%ymm14	\n\t"\
		"vmovaps	(%%rdi),%%ymm1		\n\t	vmovaps	0x20(%%rdi),%%ymm9 	\n\t"\
		"vmovaps	(%%rbx),%%ymm5		\n\t	vmovaps	0x20(%%rbx),%%ymm13	\n\t"\
		"vmovaps	(%%rsi),%%ymm2		\n\t	vmovaps	0x20(%%rsi),%%ymm10	\n\t"\
		"vmovaps	(%%rcx),%%ymm4		\n\t	vmovaps	0x20(%%rcx),%%ymm12	\n\t"\
		"vmovaps	(%%rdx),%%ymm3		\n\t	vmovaps	0x20(%%rdx),%%ymm11	\n\t"\
		"movq	%[__i0],%%rbx		\n\t"\
		"vsubpd	%%ymm1,%%ymm6,%%ymm6	\n\t	vsubpd	%%ymm9 ,%%ymm14,%%ymm14		\n\t"\
		"vsubpd	%%ymm2,%%ymm5,%%ymm5	\n\t	vsubpd	%%ymm10,%%ymm13,%%ymm13		\n\t"\
		"vsubpd	%%ymm3,%%ymm4,%%ymm4	\n\t	vsubpd	%%ymm11,%%ymm12,%%ymm12		\n\t"\
		"vaddpd	%%ymm1,%%ymm1,%%ymm1	\n\t	vaddpd	%%ymm9 ,%%ymm9 ,%%ymm9 		\n\t"\
		"vaddpd	%%ymm2,%%ymm2,%%ymm2	\n\t	vaddpd	%%ymm10,%%ymm10,%%ymm10		\n\t"\
		"vaddpd	%%ymm3,%%ymm3,%%ymm3	\n\t	vaddpd	%%ymm11,%%ymm11,%%ymm11		\n\t"\
		"vaddpd	%%ymm6,%%ymm1,%%ymm1	\n\t	vaddpd	%%ymm14,%%ymm9 ,%%ymm9  	\n\t"\
		"vaddpd	%%ymm5,%%ymm2,%%ymm2	\n\t	vaddpd	%%ymm13,%%ymm10,%%ymm10		\n\t"\
		"vaddpd	%%ymm4,%%ymm3,%%ymm3	\n\t	vaddpd	%%ymm12,%%ymm11,%%ymm11		\n\t"\
		"vmovaps	(%%rbx),%%ymm0		\n\t	vmovaps	0x20(%%rbx),%%ymm8 	\n\t"\
		"\n\t"\
		"movq	%[__o0],%%rcx		\n\t"\
		"movq	%[__cc],%%rsi		\n\t"\
		"vmovaps	%%ymm0,0x100(%%rsi)	\n\t	vmovaps	%%ymm8 ,0x140(%%rsi)	\n\t"\
		"vmovaps	%%ymm6,0x120(%%rsi)	\n\t	vmovaps	%%ymm14,0x160(%%rsi)	\n\t"\
		"vaddpd	%%ymm1,%%ymm0,%%ymm0	\n\t	vaddpd	%%ymm9 ,%%ymm8 ,%%ymm8 	\n\t"\
		"vmovaps	%%ymm5,%%ymm7		\n\t	vmovaps	%%ymm13,%%ymm15		\n\t"\
		"vaddpd	%%ymm2,%%ymm3,%%ymm3	\n\t	vaddpd	%%ymm10,%%ymm11,%%ymm11		\n\t"\
		"vsubpd	%%ymm4,%%ymm5,%%ymm5	\n\t	vsubpd	%%ymm12,%%ymm13,%%ymm13		\n\t"\
		"vsubpd	%%ymm2,%%ymm1,%%ymm1	\n\t	vsubpd	%%ymm10,%%ymm9 ,%%ymm9  	\n\t"\
		"vsubpd	%%ymm7,%%ymm6,%%ymm6	\n\t	vsubpd	%%ymm15,%%ymm14,%%ymm14		\n\t"\
		"vaddpd	%%ymm2,%%ymm2,%%ymm2	\n\t	vaddpd	%%ymm10,%%ymm10,%%ymm10		\n\t"\
		"vaddpd	%%ymm7,%%ymm4,%%ymm4	\n\t	vaddpd	%%ymm15,%%ymm12,%%ymm12		\n\t"\
		"vaddpd	%%ymm3,%%ymm0,%%ymm0	\n\t	vaddpd	%%ymm11,%%ymm8 ,%%ymm8  	\n\t"\
	"vaddpd	0x120(%%rsi),%%ymm5,%%ymm5	\n\t	vaddpd	0x160(%%rsi),%%ymm13,%%ymm13\n\t"\
		"vsubpd	%%ymm2,%%ymm3,%%ymm3	\n\t	vsubpd	%%ymm10,%%ymm11,%%ymm11		\n\t"\
		"vmovaps	%%ymm4,%%ymm7		\n\t	vmovaps	%%ymm12,%%ymm15		\n\t"\
		"vmovaps	%%ymm0,    (%%rcx)	\n\t	vmovaps	%%ymm8 ,0x20(%%rcx)	\n\t"/* B0 */\
		"vsubpd	%%ymm6,%%ymm4,%%ymm4	\n\t	vsubpd		%%ymm14,%%ymm12,%%ymm12		\n\t"\
		"vmovaps	%%ymm1,%%ymm2		\n\t	vmovaps	%%ymm9 ,%%ymm10		\n\t"\
	"vsubpd 0x100(%%rsi),%%ymm0,%%ymm0	\n\t	vsubpd 0x140(%%rsi),%%ymm8 ,%%ymm8  \n\t"\
	"vmulpd	0x20(%%rsi),%%ymm5,%%ymm5	\n\t	vmulpd	0x20(%%rsi),%%ymm13,%%ymm13	\n\t"\
	"vaddpd	%%ymm3,%%ymm2,%%ymm2		\n\t	vaddpd	%%ymm11,%%ymm10,%%ymm10		\n\t"\
	"vmulpd	0x80(%%rsi),%%ymm3,%%ymm3	\n\t	vmulpd	0x80(%%rsi),%%ymm11,%%ymm11	\n\t"\
	"vmulpd	0xe0(%%rsi),%%ymm4,%%ymm4	\n\t	vmulpd	0xe0(%%rsi),%%ymm12,%%ymm12	\n\t"\
	"vmulpd	0x40(%%rsi),%%ymm1,%%ymm1	\n\t	vmulpd	0x40(%%rsi),%%ymm9 ,%%ymm9  \n\t"\
	"vmulpd	0x60(%%rsi),%%ymm6,%%ymm6	\n\t	vmulpd	0x60(%%rsi),%%ymm14,%%ymm14	\n\t"\
	"vmulpd	    (%%rsi),%%ymm0,%%ymm0	\n\t	vmulpd	    (%%rsi),%%ymm8 ,%%ymm8  \n\t"\
	"vmulpd	0xa0(%%rsi),%%ymm7,%%ymm7	\n\t	vmulpd	0xa0(%%rsi),%%ymm15,%%ymm15	\n\t"\
	"vmulpd	0xc0(%%rsi),%%ymm2,%%ymm2	\n\t	vmulpd	0xc0(%%rsi),%%ymm10,%%ymm10	\n\t"\
	"vaddpd	    (%%rcx),%%ymm0,%%ymm0	\n\t	vaddpd	0x20(%%rcx),%%ymm8 ,%%ymm8  \n\t"\
	"vaddpd	%%ymm4,%%ymm6,%%ymm6		\n\t	vaddpd	%%ymm12,%%ymm14,%%ymm14		\n\t"\
	"vsubpd	%%ymm2,%%ymm1,%%ymm1		\n\t	vsubpd	%%ymm10,%%ymm9 ,%%ymm9  	\n\t"\
	"vsubpd	%%ymm7,%%ymm4,%%ymm4		\n\t	vsubpd	%%ymm15,%%ymm12,%%ymm12		\n\t"\
	"vsubpd	%%ymm2,%%ymm3,%%ymm3		\n\t	vsubpd	%%ymm10,%%ymm11,%%ymm11		\n\t"\
		"movq	%[__o1],%%rax		\n\t"\
		"movq	%[__o2],%%rbx		\n\t"\
		"movq	%[__o3],%%rcx		\n\t"\
		"movq	%[__o4],%%rdx		\n\t"\
		"movq	%[__o5],%%rsi		\n\t"\
		"movq	%[__o6],%%rdi		\n\t"\
		"vmovaps	%%ymm0,%%ymm2		\n\t	vmovaps	%%ymm8 ,%%ymm10		\n\t"\
		"vmovaps	%%ymm5,%%ymm7		\n\t	vmovaps	%%ymm13,%%ymm15		\n\t"\
		"vaddpd	%%ymm1,%%ymm0,%%ymm0	\n\t	vaddpd	%%ymm9 ,%%ymm8 ,%%ymm8  	\n\t"\
		"vaddpd	%%ymm6,%%ymm5,%%ymm5	\n\t	vaddpd	%%ymm14,%%ymm13,%%ymm13		\n\t"\
		"vaddpd	%%ymm3,%%ymm1,%%ymm1	\n\t	vaddpd	%%ymm11,%%ymm9 ,%%ymm9  	\n\t"\
		"vaddpd	%%ymm4,%%ymm6,%%ymm6	\n\t	vaddpd	%%ymm12,%%ymm14,%%ymm14		\n\t"\
		"vaddpd	%%ymm2,%%ymm3,%%ymm3	\n\t	vaddpd	%%ymm10,%%ymm11,%%ymm11		\n\t"\
		"vaddpd	%%ymm7,%%ymm4,%%ymm4	\n\t	vaddpd	%%ymm15,%%ymm12,%%ymm12		\n\t"\
		"vsubpd	%%ymm1,%%ymm2,%%ymm2	\n\t	vsubpd	%%ymm9 ,%%ymm10,%%ymm10		\n\t"\
		"vsubpd	%%ymm6,%%ymm7,%%ymm7	\n\t	vsubpd	%%ymm14,%%ymm15,%%ymm15		\n\t"\
		/* ymm1,6,9,14 free ... Note the order reversal on the 3rd pair of outputs: */\
		"vsubpd	%%ymm13,%%ymm0 ,%%ymm0 	\n\t	vsubpd	%%ymm15,%%ymm2 ,%%ymm2 		\n\t	vsubpd	%%ymm12,%%ymm3 ,%%ymm3 		\n\t"\
		"vsubpd	%%ymm5 ,%%ymm8 ,%%ymm8  \n\t	vsubpd	%%ymm7 ,%%ymm10,%%ymm10		\n\t	vsubpd	%%ymm4 ,%%ymm11,%%ymm11		\n\t"\
		"vaddpd	%%ymm13,%%ymm13,%%ymm13	\n\t	vaddpd	%%ymm15,%%ymm15,%%ymm15		\n\t	vaddpd	%%ymm12,%%ymm12,%%ymm12		\n\t"\
		"vaddpd	%%ymm5 ,%%ymm5 ,%%ymm5 	\n\t	vaddpd	%%ymm7 ,%%ymm7 ,%%ymm7 		\n\t	vaddpd	%%ymm4 ,%%ymm4 ,%%ymm4 		\n\t"\
		"vaddpd	%%ymm0 ,%%ymm13,%%ymm13	\n\t	vaddpd	%%ymm2 ,%%ymm15,%%ymm15		\n\t	vaddpd	%%ymm3 ,%%ymm12,%%ymm12		\n\t"\
		"vaddpd	%%ymm8 ,%%ymm5 ,%%ymm5 	\n\t	vaddpd	%%ymm10,%%ymm7 ,%%ymm7 		\n\t	vaddpd	%%ymm11,%%ymm4 ,%%ymm4 		\n\t"\
		"vmovaps	%%ymm0 ,    (%%rax)	\n\t	vmovaps	%%ymm2 ,    (%%rbx)	\n\t	vmovaps	%%ymm3 ,    (%%rdx)	\n\t"/* B124r */\
		"vmovaps	%%ymm8 ,0x20(%%rdi)	\n\t	vmovaps	%%ymm10,0x20(%%rsi)	\n\t	vmovaps	%%ymm11,0x20(%%rcx)	\n\t"/* B653i */\
		"vmovaps	%%ymm13,    (%%rdi)	\n\t	vmovaps	%%ymm15,    (%%rsi)	\n\t	vmovaps	%%ymm12,    (%%rcx)	\n\t"/* B653r */\
		"vmovaps	%%ymm5 ,0x20(%%rax)	\n\t	vmovaps	%%ymm7 ,0x20(%%rbx)	\n\t	vmovaps	%%ymm4 ,0x20(%%rdx)	\n\t"/* B124i */\
		"										\n\t"\
		:					/* outputs: none */\
		: [__i0] "m" (Xi0)	/* All inputs from memory addresses here */\
		 ,[__i1] "m" (Xi1)\
		 ,[__i2] "m" (Xi2)\
		 ,[__i3] "m" (Xi3)\
		 ,[__i4] "m" (Xi4)\
		 ,[__i5] "m" (Xi5)\
		 ,[__i6] "m" (Xi6)\
		 ,[__cc] "m" (Xcc)\
		 ,[__o0] "m" (Xo0)\
		 ,[__o1] "m" (Xo1)\
		 ,[__o2] "m" (Xo2)\
		 ,[__o3] "m" (Xo3)\
		 ,[__o4] "m" (Xo4)\
		 ,[__o5] "m" (Xo5)\
		 ,[__o6] "m" (Xo6)\
		: "cc","memory","rax","rbx","rcx","rdx","rsi","rdi","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15"		/* Clobbered registers */\
	);\
	}

	/* Twiddleless version of SSE2_RADIX8_DIF_TWIDDLE. Inputs enter in memory locations __r0 + [__i1,__i2,__i3,__i4,__i5,__i6,__i7],
	where r0 is a memory address and the i's are LITERAL [BYTE] OFFSETS. Outputs go into memory locations __o0,__o1,__o2,__o3,__o4,__o5,__o6,__o7, assumed disjoint with inputs:
	*/
	#define SSE2_RADIX8_DIF_0TWIDDLE(Xr0, Xi1,Xi2,Xi3,Xi4,Xi5,Xi6,Xi7, Xo0,Xo1,Xo2,Xo3,Xo4,Xo5,Xo6,Xo7, Xisrt2)\
	{\
	__asm__ volatile (\
		"/* 1st of 2 radix-4 subtransforms, data in ymm0-7: */\n\t	/* 2nd of 2 radix-4 subtransforms, data in ymm8-15: */\n\t"\
		"movq	%[__r0],%%rax	/* i0 = r00 */	\n\t		leaq	%c[__i1](%%rax),%%r10	/* i1 */\n\t"\
		"leaq	%c[__i2](%%rax),%%rbx			\n\t		leaq	%c[__i3](%%rax),%%r11	/* i3 */\n\t"\
		"leaq	%c[__i4](%%rax),%%rcx			\n\t		leaq	%c[__i5](%%rax),%%r12	/* i5 */\n\t"\
		"leaq	%c[__i6](%%rax),%%rdx			\n\t		leaq	%c[__i7](%%rax),%%r13	/* i7 */\n\t"\
		"movq	%[__isrt2],%%rsi				\n\t		/* p1,5 combo: x+y into ymm8 /1, x-y in ymm10/3: */	\n\t"\
		"/* p0,4 combo: x+-y into ymm0/1, 2/3, resp: */\n\t	vmovaps	     (%%r12),%%ymm8 			\n\t"\
		"										\n\t		vmovaps	0x020(%%r12),%%ymm9 			\n\t"\
		"vmovaps	     (%%rcx),%%ymm0			\n\t		vmovaps	     (%%r10),%%ymm10			\n\t"\
		"vmovaps	0x020(%%rcx),%%ymm1			\n\t		vmovaps	0x020(%%r10),%%ymm11			\n\t"\
		"vmovaps	     (%%rax),%%ymm2			\n\t		vsubpd	%%ymm8 ,%%ymm10,%%ymm10			\n\t"\
		"vmovaps	0x020(%%rax),%%ymm3			\n\t		vsubpd	%%ymm9 ,%%ymm11,%%ymm11			\n\t"\
		"vsubpd	%%ymm0,%%ymm2,%%ymm2			\n\t		vaddpd	%%ymm8 ,%%ymm8 ,%%ymm8 			\n\t"\
		"vsubpd	%%ymm1,%%ymm3,%%ymm3			\n\t		vaddpd	%%ymm9 ,%%ymm9 ,%%ymm9 			\n\t"\
		"vaddpd	%%ymm0,%%ymm0,%%ymm0			\n\t		vaddpd	%%ymm10,%%ymm8 ,%%ymm8 			\n\t"\
		"vaddpd	%%ymm1,%%ymm1,%%ymm1			\n\t		vaddpd	%%ymm11,%%ymm9 ,%%ymm9 			\n\t"\
		"vaddpd	%%ymm2,%%ymm0,%%ymm0			\n\t		/* p3,7 combo: x+y into ymm14/7, x-y in ymm12/5: */	\n\t"\
		"vaddpd	%%ymm3,%%ymm1,%%ymm1			\n\t		vmovaps	     (%%r11),%%ymm12			\n\t"\
		"										\n\t		vmovaps	0x020(%%r11),%%ymm13			\n\t"\
		"/* p2,6 combo: x+-y into ymm4/5, 6/7, resp: */\n\t	vmovaps	     (%%r13),%%ymm14			\n\t"\
		"										\n\t		vmovaps	0x020(%%r13),%%ymm15			\n\t"\
		"vmovaps	     (%%rdx),%%ymm4			\n\t		vsubpd	%%ymm14,%%ymm12,%%ymm12			\n\t"\
		"vmovaps	0x020(%%rdx),%%ymm5			\n\t		vsubpd	%%ymm15,%%ymm13,%%ymm13			\n\t"\
		"vmovaps	     (%%rbx),%%ymm6			\n\t		vaddpd	%%ymm14,%%ymm14,%%ymm14			\n\t"\
		"vmovaps	0x020(%%rbx),%%ymm7			\n\t		vaddpd	%%ymm15,%%ymm15,%%ymm15			\n\t"\
		"vsubpd	%%ymm4,%%ymm6,%%ymm6			\n\t		vaddpd	%%ymm12,%%ymm14,%%ymm14			\n\t"\
		"vsubpd	%%ymm5,%%ymm7,%%ymm7			\n\t		vaddpd	%%ymm13,%%ymm15,%%ymm15			\n\t"\
		"vaddpd	%%ymm4,%%ymm4,%%ymm4			\n\t		/* Finish radix-4 butterfly, tmp-store 1st of 4 outputs to free up 2 registers: */\n\t"\
		"vaddpd	%%ymm5,%%ymm5,%%ymm5			\n\t		vsubpd	%%ymm14,%%ymm8 ,%%ymm8 			\n\t"\
		"vaddpd	%%ymm6,%%ymm4,%%ymm4			\n\t		vsubpd	%%ymm15,%%ymm9 ,%%ymm9 			\n\t"\
		"vaddpd	%%ymm7,%%ymm5,%%ymm5			\n\t		vsubpd	%%ymm13,%%ymm10,%%ymm10			\n\t"\
		"										\n\t		vsubpd	%%ymm12,%%ymm11,%%ymm11			\n\t"\
		"vsubpd	%%ymm4,%%ymm0,%%ymm0			\n\t		vaddpd	%%ymm14,%%ymm14,%%ymm14			\n\t"\
		"vsubpd	%%ymm7,%%ymm2,%%ymm2			\n\t		vaddpd	%%ymm13,%%ymm13,%%ymm13			\n\t"\
		"vsubpd	%%ymm5,%%ymm1,%%ymm1			\n\t		vaddpd	%%ymm15,%%ymm15,%%ymm15			\n\t"\
		"vsubpd	%%ymm6,%%ymm3,%%ymm3			\n\t		vaddpd	%%ymm12,%%ymm12,%%ymm12			\n\t"\
		"													vaddpd	%%ymm8 ,%%ymm14,%%ymm14			\n\t"\
		"													vaddpd	%%ymm10,%%ymm13,%%ymm13			\n\t"\
		"													vaddpd	%%ymm9 ,%%ymm15,%%ymm15			\n\t"\
		"													vaddpd	%%ymm11,%%ymm12,%%ymm12			\n\t"\
		"vaddpd	%%ymm4,%%ymm4,%%ymm4			\n\t		vmovaps	%%ymm14,     (%%r10)			\n\t"\
		"vaddpd	%%ymm7,%%ymm7,%%ymm7			\n\t		vmovaps	%%ymm15,0x020(%%r10)			\n\t"\
		"vaddpd	%%ymm5,%%ymm5,%%ymm5			\n\t		vmovaps	%%ymm10,%%ymm14					\n\t"\
		"vaddpd	%%ymm6,%%ymm6,%%ymm6			\n\t		vmovaps	%%ymm13,%%ymm15					\n\t"\
		"vaddpd	%%ymm0,%%ymm4,%%ymm4			\n\t		vsubpd	%%ymm12,%%ymm10,%%ymm10			\n\t"\
		"vaddpd	%%ymm2,%%ymm7,%%ymm7			\n\t		vsubpd	%%ymm11,%%ymm13,%%ymm13			\n\t"\
		"vaddpd	%%ymm1,%%ymm5,%%ymm5			\n\t		vaddpd	%%ymm14,%%ymm12,%%ymm12			\n\t"\
		"vaddpd	%%ymm3,%%ymm6,%%ymm6			\n\t		vaddpd	%%ymm15,%%ymm11,%%ymm11			\n\t"\
		"													vmovaps	(%%rsi),%%ymm14	/* isrt2 */		\n\t"\
		"													vmulpd	%%ymm14,%%ymm10,%%ymm10			\n\t"\
		"													vmulpd	%%ymm14,%%ymm13,%%ymm13			\n\t"\
		"													vmulpd	%%ymm14,%%ymm12,%%ymm12			\n\t"\
		"										\n\t		vmulpd	%%ymm14,%%ymm11,%%ymm11			\n\t"\
		"vmovaps      (%%r10),%%ymm14 /* reload spill */\n\t"\
		"vmovaps 0x020(%%r10),%%ymm15 /* reload spill */\n\t"\
		"										\n\t"\
		"/* Inline of SSE2_RADIX8_DIF_COMBINE_RAD4_SUBS_A(r0): Combine radix-4 subtransforms and write outputs: */\n\t"\
		"/***** t0,1,2,3,4,5,6,7 in ymm[ 4, 5| 2,6| 0, 1| 7,3] *****/\n\t"\
		"/***** t8,9,a,b,c,d,e,f in ymm[14,15|10,12| 8, 9|13,11] */\n\t"\
		"movq	%[__o4],%%rax					\n\t		vsubpd   %%ymm10,%%ymm2	,%%ymm2			\n\t"\
		"movq	%[__o5],%%rbx					\n\t		vsubpd   %%ymm12,%%ymm6	,%%ymm6			\n\t"\
		"movq	%[__o6],%%rcx					\n\t		vaddpd   %%ymm10,%%ymm10,%%ymm10		\n\t"\
		"movq	%[__o7],%%rdx					\n\t		vaddpd   %%ymm12,%%ymm12,%%ymm12		\n\t"\
		"										\n\t		vaddpd   %%ymm2 ,%%ymm10,%%ymm10		\n\t"\
		"vsubpd   %%ymm11,%%ymm7 ,%%ymm7 		\n\t		vaddpd   %%ymm6 ,%%ymm12,%%ymm12		\n\t"\
		"vsubpd   %%ymm13,%%ymm3 ,%%ymm3 		\n\t"\
		"vaddpd   %%ymm11,%%ymm11,%%ymm11		\n\t		vmovaps	%%ymm2 ,     (%%rbx)	/* o5r */	\n\t"\
		"vaddpd   %%ymm13,%%ymm13,%%ymm13		\n\t		vmovaps	%%ymm6 ,0x020(%%rbx)	/* o5i */	\n\t"\
		"vaddpd   %%ymm7 ,%%ymm11,%%ymm11		\n\t		vmovaps	%%ymm10,     (%%rax)	/* o4r */	\n\t"\
		"vaddpd   %%ymm3 ,%%ymm13,%%ymm13		\n\t		vmovaps	%%ymm12,0x020(%%rax)	/* o4i */	\n\t"\
		"										\n\t"\
		"vmovaps	%%ymm7 ,     (%%rcx)	/* o6r */\n\t"\
		"vmovaps	%%ymm3 ,0x020(%%rdx)	/* o7i */\n\t"\
		"vmovaps	%%ymm11,    (%%rdx)	/* o7r */	\n\t"\
		"vmovaps	%%ymm13,0x020(%%rcx)	/* o6i */\n\t"\
		"										\n\t"\
		"movq	%[__o0],%%rax					\n\t"\
		"movq	%[__o1],%%rbx					\n\t"\
		"movq	%[__o2],%%rcx					\n\t"\
		"movq	%[__o3],%%rdx					\n\t"\
		"										\n\t"\
		"vsubpd	%%ymm14,%%ymm4 ,%%ymm4 			\n\t"\
		"vsubpd	%%ymm15,%%ymm5 ,%%ymm5 			\n\t"\
		"vsubpd	%%ymm9 ,%%ymm0 ,%%ymm0 			\n\t"\
		"vsubpd	%%ymm8 ,%%ymm1 ,%%ymm1 			\n\t"\
		"vaddpd	%%ymm14,%%ymm14,%%ymm14			\n\t		vmovaps	%%ymm4 ,     (%%rbx)	/* o1r */	\n\t"\
		"vaddpd	%%ymm15,%%ymm15,%%ymm15			\n\t		vmovaps	%%ymm5 ,0x020(%%rbx)	/* o1i */	\n\t"\
		"vaddpd	%%ymm9 ,%%ymm9 ,%%ymm9 			\n\t		vmovaps	%%ymm0 ,     (%%rcx)	/* o2r */	\n\t"\
		"vaddpd	%%ymm8 ,%%ymm8 ,%%ymm8 			\n\t		vmovaps	%%ymm1 ,0x020(%%rdx)	/* o3i */	\n\t"\
		"vaddpd	%%ymm4 ,%%ymm14,%%ymm14			\n\t"\
		"vaddpd	%%ymm5 ,%%ymm15,%%ymm15			\n\t"\
		"vaddpd	%%ymm0 ,%%ymm9 ,%%ymm9 			\n\t"\
		"vaddpd	%%ymm1 ,%%ymm8 ,%%ymm8 			\n\t"\
		"										\n\t"\
		"vmovaps	%%ymm14,    (%%rax)	/* o0r */	\n\t"\
		"vmovaps	%%ymm15,0x020(%%rax)	/* o0r */\n\t"\
		"vmovaps	%%ymm9 ,     (%%rdx)	/* o3r */\n\t"\
		"vmovaps	%%ymm8 ,0x020(%%rcx)	/* o2i */\n\t"\
		"										\n\t"\
		:					/* outputs: none */\
		: [__r0] "m" (Xr0)	/* All inputs from memory addresses here */\
		 ,[__i1] "e" (Xi1)\
		 ,[__i2] "e" (Xi2)\
		 ,[__i3] "e" (Xi3)\
		 ,[__i4] "e" (Xi4)\
		 ,[__i5] "e" (Xi5)\
		 ,[__i6] "e" (Xi6)\
		 ,[__i7] "e" (Xi7)\
		 ,[__o0] "m" (Xo0)\
		 ,[__o1] "m" (Xo1)\
		 ,[__o2] "m" (Xo2)\
		 ,[__o3] "m" (Xo3)\
		 ,[__o4] "m" (Xo4)\
		 ,[__o5] "m" (Xo5)\
		 ,[__o6] "m" (Xo6)\
		 ,[__o7] "m" (Xo7)\
		 ,[__isrt2] "m" (Xisrt2)\
		: "cc","memory","rax","rbx","rcx","rdx","rsi","r10","r11","r12","r13","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15"		/* Clobbered registers */\
	);\
	}

	// Need a 2nd version of above which takes the i-strides as intvars rather than literal bytes:
	#define SSE2_RADIX8_DIF_0TWIDDLE_B(Xr0, Xi1,Xi2,Xi3,Xi4,Xi5,Xi6,Xi7, Xo0,Xo1,Xo2,Xo3,Xo4,Xo5,Xo6,Xo7, Xisrt2)\
	{\
	__asm__ volatile (\
		"/* 1st of 2 radix-4 subtransforms, data in ymm0-7: */\n\t	/* 2nd of 2 radix-4 subtransforms, data in ymm8-15: */\n\t"\
		"movq	%[__r0],%%rax	/* i0 = r00 */	\n\t		movslq	%[__i1],%%r10		/* i1 */	\n\t"\
		"movslq	%[__i2],%%rbx	/* i2 */		\n\t		movslq	%[__i3],%%r11		/* i3 */	\n\t"\
		"movslq	%[__i4],%%rcx	/* i4 */		\n\t		movslq	%[__i5],%%r12		/* i5 */	\n\t"\
		"movslq	%[__i6],%%rdx	/* i6 */		\n\t		movslq	%[__i7],%%r13		/* i7 */	\n\t"\
		"addq	%%rax,%%rbx						\n\t		addq	%%rax,%%r10						\n\t"\
		"addq	%%rax,%%rcx						\n\t		addq	%%rax,%%r11						\n\t"\
		"addq	%%rax,%%rdx						\n\t		addq	%%rax,%%r12						\n\t"\
		"movq	%[__isrt2],%%rsi				\n\t		addq	%%rax,%%r13						\n\t"\
		"										\n\t		/* p1,5 combo: x+y into ymm8 /1, x-y in ymm10/3: */	\n\t"\
		"/* p0,4 combo: x+-y into ymm0/1, 2/3, resp: */\n\t	vmovaps	     (%%r12),%%ymm8 			\n\t"\
		"										\n\t		vmovaps	0x020(%%r12),%%ymm9 			\n\t"\
		"vmovaps	     (%%rcx),%%ymm0			\n\t		vmovaps	     (%%r10),%%ymm10			\n\t"\
		"vmovaps	0x020(%%rcx),%%ymm1			\n\t		vmovaps	0x020(%%r10),%%ymm11			\n\t"\
		"vmovaps	     (%%rax),%%ymm2			\n\t		vsubpd	%%ymm8 ,%%ymm10,%%ymm10			\n\t"\
		"vmovaps	0x020(%%rax),%%ymm3			\n\t		vsubpd	%%ymm9 ,%%ymm11,%%ymm11			\n\t"\
		"vsubpd	%%ymm0,%%ymm2,%%ymm2			\n\t		vaddpd	%%ymm8 ,%%ymm8 ,%%ymm8 			\n\t"\
		"vsubpd	%%ymm1,%%ymm3,%%ymm3			\n\t		vaddpd	%%ymm9 ,%%ymm9 ,%%ymm9 			\n\t"\
		"vaddpd	%%ymm0,%%ymm0,%%ymm0			\n\t		vaddpd	%%ymm10,%%ymm8 ,%%ymm8 			\n\t"\
		"vaddpd	%%ymm1,%%ymm1,%%ymm1			\n\t		vaddpd	%%ymm11,%%ymm9 ,%%ymm9 			\n\t"\
		"vaddpd	%%ymm2,%%ymm0,%%ymm0			\n\t		/* p3,7 combo: x+y into ymm14/7, x-y in ymm12/5: */	\n\t"\
		"vaddpd	%%ymm3,%%ymm1,%%ymm1			\n\t		vmovaps	     (%%r11),%%ymm12			\n\t"\
		"										\n\t		vmovaps	0x020(%%r11),%%ymm13			\n\t"\
		"/* p2,6 combo: x+-y into ymm4/5, 6/7, resp: */\n\t	vmovaps	     (%%r13),%%ymm14			\n\t"\
		"										\n\t		vmovaps	0x020(%%r13),%%ymm15			\n\t"\
		"vmovaps	     (%%rdx),%%ymm4			\n\t		vsubpd	%%ymm14,%%ymm12,%%ymm12			\n\t"\
		"vmovaps	0x020(%%rdx),%%ymm5			\n\t		vsubpd	%%ymm15,%%ymm13,%%ymm13			\n\t"\
		"vmovaps	     (%%rbx),%%ymm6			\n\t		vaddpd	%%ymm14,%%ymm14,%%ymm14			\n\t"\
		"vmovaps	0x020(%%rbx),%%ymm7			\n\t		vaddpd	%%ymm15,%%ymm15,%%ymm15			\n\t"\
		"vsubpd	%%ymm4,%%ymm6,%%ymm6			\n\t		vaddpd	%%ymm12,%%ymm14,%%ymm14			\n\t"\
		"vsubpd	%%ymm5,%%ymm7,%%ymm7			\n\t		vaddpd	%%ymm13,%%ymm15,%%ymm15			\n\t"\
		"vaddpd	%%ymm4,%%ymm4,%%ymm4			\n\t		/* Finish radix-4 butterfly, tmp-store 1st of 4 outputs to free up 2 registers: */\n\t"\
		"vaddpd	%%ymm5,%%ymm5,%%ymm5			\n\t		vsubpd	%%ymm14,%%ymm8 ,%%ymm8 			\n\t"\
		"vaddpd	%%ymm6,%%ymm4,%%ymm4			\n\t		vsubpd	%%ymm15,%%ymm9 ,%%ymm9 			\n\t"\
		"vaddpd	%%ymm7,%%ymm5,%%ymm5			\n\t		vsubpd	%%ymm13,%%ymm10,%%ymm10			\n\t"\
		"										\n\t		vsubpd	%%ymm12,%%ymm11,%%ymm11			\n\t"\
		"vsubpd	%%ymm4,%%ymm0,%%ymm0			\n\t		vaddpd	%%ymm14,%%ymm14,%%ymm14			\n\t"\
		"vsubpd	%%ymm7,%%ymm2,%%ymm2			\n\t		vaddpd	%%ymm13,%%ymm13,%%ymm13			\n\t"\
		"vsubpd	%%ymm5,%%ymm1,%%ymm1			\n\t		vaddpd	%%ymm15,%%ymm15,%%ymm15			\n\t"\
		"vsubpd	%%ymm6,%%ymm3,%%ymm3			\n\t		vaddpd	%%ymm12,%%ymm12,%%ymm12			\n\t"\
		"													vaddpd	%%ymm8 ,%%ymm14,%%ymm14			\n\t"\
		"													vaddpd	%%ymm10,%%ymm13,%%ymm13			\n\t"\
		"													vaddpd	%%ymm9 ,%%ymm15,%%ymm15			\n\t"\
		"													vaddpd	%%ymm11,%%ymm12,%%ymm12			\n\t"\
		"vaddpd	%%ymm4,%%ymm4,%%ymm4			\n\t		vmovaps	%%ymm14,     (%%r10)			\n\t"\
		"vaddpd	%%ymm7,%%ymm7,%%ymm7			\n\t		vmovaps	%%ymm15,0x020(%%r10)			\n\t"\
		"vaddpd	%%ymm5,%%ymm5,%%ymm5			\n\t		vmovaps	%%ymm10,%%ymm14					\n\t"\
		"vaddpd	%%ymm6,%%ymm6,%%ymm6			\n\t		vmovaps	%%ymm13,%%ymm15					\n\t"\
		"vaddpd	%%ymm0,%%ymm4,%%ymm4			\n\t		vsubpd	%%ymm12,%%ymm10,%%ymm10			\n\t"\
		"vaddpd	%%ymm2,%%ymm7,%%ymm7			\n\t		vsubpd	%%ymm11,%%ymm13,%%ymm13			\n\t"\
		"vaddpd	%%ymm1,%%ymm5,%%ymm5			\n\t		vaddpd	%%ymm14,%%ymm12,%%ymm12			\n\t"\
		"vaddpd	%%ymm3,%%ymm6,%%ymm6			\n\t		vaddpd	%%ymm15,%%ymm11,%%ymm11			\n\t"\
		"													vmovaps	(%%rsi),%%ymm14	/* isrt2 */		\n\t"\
		"													vmulpd	%%ymm14,%%ymm10,%%ymm10			\n\t"\
		"													vmulpd	%%ymm14,%%ymm13,%%ymm13			\n\t"\
		"													vmulpd	%%ymm14,%%ymm12,%%ymm12			\n\t"\
		"										\n\t		vmulpd	%%ymm14,%%ymm11,%%ymm11			\n\t"\
		"vmovaps      (%%r10),%%ymm14 /* reload spill */\n\t"\
		"vmovaps 0x020(%%r10),%%ymm15 /* reload spill */\n\t"\
		"										\n\t"\
		"/* Inline of SSE2_RADIX8_DIF_COMBINE_RAD4_SUBS_A(r0): Combine radix-4 subtransforms and write outputs: */\n\t"\
		"/***** t0,1,2,3,4,5,6,7 in ymm[ 4, 5| 2,6| 0, 1| 7,3] *****/\n\t"\
		"/***** t8,9,a,b,c,d,e,f in ymm[14,15|10,12| 8, 9|13,11] */\n\t"\
		"movq	%[__o4],%%rax					\n\t		vsubpd   %%ymm10,%%ymm2	,%%ymm2			\n\t"\
		"movq	%[__o5],%%rbx					\n\t		vsubpd   %%ymm12,%%ymm6	,%%ymm6			\n\t"\
		"movq	%[__o6],%%rcx					\n\t		vaddpd   %%ymm10,%%ymm10,%%ymm10		\n\t"\
		"movq	%[__o7],%%rdx					\n\t		vaddpd   %%ymm12,%%ymm12,%%ymm12		\n\t"\
		"										\n\t		vaddpd   %%ymm2 ,%%ymm10,%%ymm10		\n\t"\
		"vsubpd   %%ymm11,%%ymm7 ,%%ymm7 		\n\t		vaddpd   %%ymm6 ,%%ymm12,%%ymm12		\n\t"\
		"vsubpd   %%ymm13,%%ymm3 ,%%ymm3 		\n\t"\
		"vaddpd   %%ymm11,%%ymm11,%%ymm11		\n\t		vmovaps	%%ymm2 ,     (%%rbx)	/* o5r */	\n\t"\
		"vaddpd   %%ymm13,%%ymm13,%%ymm13		\n\t		vmovaps	%%ymm6 ,0x020(%%rbx)	/* o5i */	\n\t"\
		"vaddpd   %%ymm7 ,%%ymm11,%%ymm11		\n\t		vmovaps	%%ymm10,     (%%rax)	/* o4r */	\n\t"\
		"vaddpd   %%ymm3 ,%%ymm13,%%ymm13		\n\t		vmovaps	%%ymm12,0x020(%%rax)	/* o4i */	\n\t"\
		"										\n\t"\
		"vmovaps	%%ymm7 ,     (%%rcx)	/* o6r */\n\t"\
		"vmovaps	%%ymm3 ,0x020(%%rdx)	/* o7i */\n\t"\
		"vmovaps	%%ymm11,     (%%rdx)	/* o7r */\n\t"\
		"vmovaps	%%ymm13,0x020(%%rcx)	/* o6i */\n\t"\
		"										\n\t"\
		"movq	%[__o0],%%rax					\n\t"\
		"movq	%[__o1],%%rbx					\n\t"\
		"movq	%[__o2],%%rcx					\n\t"\
		"movq	%[__o3],%%rdx					\n\t"\
		"										\n\t"\
		"vsubpd	%%ymm14,%%ymm4 ,%%ymm4 			\n\t"\
		"vsubpd	%%ymm15,%%ymm5 ,%%ymm5 			\n\t"\
		"vsubpd	%%ymm9 ,%%ymm0 ,%%ymm0 			\n\t"\
		"vsubpd	%%ymm8 ,%%ymm1 ,%%ymm1 			\n\t"\
		"vaddpd	%%ymm14,%%ymm14,%%ymm14			\n\t		vmovaps	%%ymm4 ,     (%%rbx)	/* o1r */	\n\t"\
		"vaddpd	%%ymm15,%%ymm15,%%ymm15			\n\t		vmovaps	%%ymm5 ,0x020(%%rbx)	/* o1i */	\n\t"\
		"vaddpd	%%ymm9 ,%%ymm9 ,%%ymm9 			\n\t		vmovaps	%%ymm0 ,     (%%rcx)	/* o2r */	\n\t"\
		"vaddpd	%%ymm8 ,%%ymm8 ,%%ymm8 			\n\t		vmovaps	%%ymm1 ,0x020(%%rdx)	/* o3i */	\n\t"\
		"vaddpd	%%ymm4 ,%%ymm14,%%ymm14			\n\t"\
		"vaddpd	%%ymm5 ,%%ymm15,%%ymm15			\n\t"\
		"vaddpd	%%ymm0 ,%%ymm9 ,%%ymm9 			\n\t"\
		"vaddpd	%%ymm1 ,%%ymm8 ,%%ymm8 			\n\t"\
		"										\n\t"\
		"vmovaps	%%ymm14,     (%%rax)	/* o0r */\n\t"\
		"vmovaps	%%ymm15,0x020(%%rax)	/* o0r */\n\t"\
		"vmovaps	%%ymm9 ,     (%%rdx)	/* o3r */\n\t"\
		"vmovaps	%%ymm8 ,0x020(%%rcx)	/* o2i */\n\t"\
		"										\n\t"\
		:					/* outputs: none */\
		: [__r0] "m" (Xr0)	/* All inputs from memory addresses here */\
		 ,[__i1] "m" (Xi1)\
		 ,[__i2] "m" (Xi2)\
		 ,[__i3] "m" (Xi3)\
		 ,[__i4] "m" (Xi4)\
		 ,[__i5] "m" (Xi5)\
		 ,[__i6] "m" (Xi6)\
		 ,[__i7] "m" (Xi7)\
		 ,[__o0] "m" (Xo0)\
		 ,[__o1] "m" (Xo1)\
		 ,[__o2] "m" (Xo2)\
		 ,[__o3] "m" (Xo3)\
		 ,[__o4] "m" (Xo4)\
		 ,[__o5] "m" (Xo5)\
		 ,[__o6] "m" (Xo6)\
		 ,[__o7] "m" (Xo7)\
		 ,[__isrt2] "m" (Xisrt2)\
		: "cc","memory","rax","rbx","rcx","rdx","rsi","r10","r11","r12","r13","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15"		/* Clobbered registers */\
	);\
	}

	// AVX analog of dft_macro.h::RADIX_08_DIF_TWIDDLE_OOP - Result of adding separate I/O addressing to
	// radix8_dif_dit_pass_gcc64.h::SSE2_RADIX8_DIF_TWIDDLE:
	#define SSE2_RADIX8_DIF_TWIDDLE_OOP(Xin0,Xi1, Xout0,Xoff, Xtwid_ptrs, Xtwo)\
	{\
	__asm__ volatile (\
	/* i1 is base byte-offset, no need to lshift it prior to add: */\
		"xorq	%%r8,%%r8	\n\t	leaq	%c[i1](%%r8),%%r8	\n\t"/* movq|movslq of literal %c[i1] both segfaulted, workaround via LEA */\
		"movq	%[in0],%%rax		\n\t	leaq	(%%rax,%%r8,4),%%r10	\n\t"/* [lcol,rcol] base-addresses = in0 + [0,4*istride] */\
		"movq	%[twid_ptrs],%%rsi	\n\t			leaq	(%%r10,%%r8  ),%%r11		\n\t"\
		/* The twid_ptrs[] array holds ptrs to 14 complex twiddles in BR order: (c,s)[4,2,6,1,5,3,7]: */\
		"				movq	0x30(%%rsi),%%r12	\n\t	movq	0x40(%%rsi),%%r14	\n\t"/* c1,c5 */\
		"				movq	0x38(%%rsi),%%r13	\n\t	movq	0x48(%%rsi),%%r15	\n\t"/* s1,s5 */\
		"											vmovaps	    (%%r10)	,%%ymm8 	\n\t"\
		"leaq	(%%rax,%%r8),%%rbx		\n\t		vmovaps	0x20(%%r10)	,%%ymm10	\n\t"\
		"movq	    (%%rsi),%%rcx	\n\t"/* c4 */"	vmovaps		%%ymm8 ,%%ymm9 	\n\t"\
		"movq	0x08(%%rsi),%%rsi	\n\t"/* s4 */"	vmovaps		%%ymm10,%%ymm11	\n\t"\
	/* [rsi] (and if needed rdi) points to sine components of each sincos pair, which is not really a pair here in terms of relative addressing: */\
		"vmovaps	    (%%rax)	,%%ymm0			\n\t		vmulpd	    (%%r12)	,%%ymm8 ,%%ymm8 	\n\t"\
		"vmovaps	0x20(%%rax)	,%%ymm1			\n\t		vmulpd	    (%%r13)	,%%ymm10,%%ymm10	\n\t"\
		"vmovaps	    (%%rax)	,%%ymm6			\n\t		vmulpd	    (%%r13)	,%%ymm9 ,%%ymm9 	\n\t"\
		"vmovaps	0x20(%%rax)	,%%ymm7			\n\t		vmulpd	    (%%r12)	,%%ymm11,%%ymm11	\n\t"\
		"vmovaps	    (%%rbx)	,%%ymm2			\n\t		vsubpd	%%ymm10		,%%ymm8 ,%%ymm8 	\n\t"\
		"vmovaps	0x20(%%rbx)	,%%ymm3			\n\t		vaddpd	%%ymm11		,%%ymm9 ,%%ymm9 	\n\t"\
		"vmovaps	    (%%rbx)	,%%ymm4			\n\t		vmovaps	    (%%r11)	,%%ymm10			\n\t"\
		"vmovaps	0x20(%%rbx)	,%%ymm5			\n\t		vmovaps	0x20(%%r11)	,%%ymm11			\n\t"\
		"vmulpd	    (%%rcx)	,%%ymm2,%%ymm2		\n\t		vmovaps	    (%%r11)	,%%ymm12			\n\t"\
		"vmulpd	    (%%rcx)	,%%ymm3,%%ymm3		\n\t		vmovaps	0x20(%%r11)	,%%ymm13			\n\t"\
		"vmulpd	    (%%rsi)	,%%ymm4,%%ymm4		\n\t		vmulpd	    (%%r14)	,%%ymm10,%%ymm10	\n\t"\
		"vmulpd	    (%%rsi)	,%%ymm5,%%ymm5		\n\t		vmulpd	    (%%r15)	,%%ymm11,%%ymm11	\n\t"\
		"vsubpd	%%ymm5		,%%ymm2,%%ymm2		\n\t		vmulpd	    (%%r15)	,%%ymm12,%%ymm12	\n\t"\
		"vaddpd	%%ymm4		,%%ymm3,%%ymm3		\n\t		vmulpd	    (%%r14)	,%%ymm13,%%ymm13	\n\t"\
		"vaddpd	%%ymm2		,%%ymm0,%%ymm0		\n\t		vsubpd	%%ymm11		,%%ymm10,%%ymm10	\n\t"\
		"vaddpd	%%ymm3		,%%ymm1,%%ymm1		\n\t		vaddpd	%%ymm13		,%%ymm12,%%ymm12	\n\t"\
		"vsubpd	%%ymm2		,%%ymm6,%%ymm6		\n\t		vmovaps	%%ymm10		,%%ymm11			\n\t"\
		"vsubpd	%%ymm3		,%%ymm7,%%ymm7		\n\t		vmovaps	%%ymm12		,%%ymm13			\n\t"\
		"vmovaps	%%ymm0		,    (%%rax)	\n\t		vaddpd	%%ymm8 		,%%ymm10,%%ymm10	\n\t"\
		"vmovaps	%%ymm1		,0x20(%%rax)	\n\t		vsubpd	%%ymm11		,%%ymm8 ,%%ymm8 	\n\t"\
		"vmovaps	%%ymm6		,    (%%rbx)	\n\t		vaddpd	%%ymm9 		,%%ymm12,%%ymm12	\n\t"\
		"vmovaps	%%ymm7		,0x20(%%rbx)	\n\t		vsubpd	%%ymm13		,%%ymm9 ,%%ymm9 	\n\t"\
		"leaq	(%%rax,%%r8,2),%%rax			\n\t		vmovaps	%%ymm10	,    (%%r10)	\n\t"\
		"leaq	(%%rbx,%%r8,2),%%rbx			\n\t		vmovaps	%%ymm12	,0x20(%%r10)	\n\t"\
		"movq	%[twid_ptrs],%%r15	\n\t			leaq	(%%r10,%%r8  ),%%r11		\n\t"\
		"movq 0x10(%%r15),%%rcx \n\t movq 0x18(%%r15),%%rsi	\n\t	vmovaps	%%ymm8,    (%%r11)	\n\t"/* c2,s2 */\
		"movq 0x20(%%r15),%%rdx \n\t movq 0x28(%%r15),%%rdi	\n\t	vmovaps	%%ymm9,0x20(%%r11)	\n\t"/* c6,s6 */\
		"vmovaps	    (%%rax)	,%%ymm0			\n\t		leaq	(%%r10,%%r8,2),%%r10		\n\t"\
		"vmovaps	0x20(%%rax)	,%%ymm2			\n\t		leaq	(%%r11,%%r8,2),%%r11		\n\t"\
		"vmovaps		%%ymm0	,%%ymm1		\n\t	movq 0x50(%%r15),%%r12 \n\t movq 0x58(%%r15),%%r13	\n\t"/* c3,s3 */\
		"vmovaps		%%ymm2	,%%ymm3		\n\t	movq 0x60(%%r15),%%r14 \n\t movq 0x68(%%r15),%%r15	\n\t"/* c7,s7 */\
		"vmulpd	    (%%rcx)	,%%ymm0,%%ymm0		\n\t		vmovaps	    (%%r10)	,%%ymm8 			\n\t"\
		"vmulpd	    (%%rsi)	,%%ymm2,%%ymm2		\n\t		vmovaps	0x20(%%r10)	,%%ymm10			\n\t"\
		"vmulpd	    (%%rsi)	,%%ymm1,%%ymm1		\n\t		vmovaps	    (%%r10)	,%%ymm9 			\n\t"\
		"vmulpd	    (%%rcx)	,%%ymm3,%%ymm3		\n\t		vmovaps	0x20(%%r10)	,%%ymm11			\n\t"\
		"vsubpd	%%ymm2		,%%ymm0,%%ymm0		\n\t		vmulpd	    (%%r12)	,%%ymm8 ,%%ymm8 	\n\t"\
		"vaddpd	%%ymm3		,%%ymm1,%%ymm1		\n\t		vmulpd	    (%%r13)	,%%ymm10,%%ymm10	\n\t"\
		"vmovaps	    (%%rbx)	,%%ymm2			\n\t		vmulpd	    (%%r13)	,%%ymm9 ,%%ymm9 	\n\t"\
		"vmovaps	0x20(%%rbx)	,%%ymm3			\n\t		vmulpd	    (%%r12)	,%%ymm11,%%ymm11	\n\t"\
		"vmovaps	    (%%rbx)	,%%ymm4			\n\t		vsubpd	%%ymm10		,%%ymm8 ,%%ymm8 	\n\t"\
		"vmovaps	0x20(%%rbx)	,%%ymm5			\n\t		vaddpd	%%ymm11		,%%ymm9 ,%%ymm9 	\n\t"\
		"vmulpd	    (%%rdx)	,%%ymm2,%%ymm2		\n\t		vmovaps	    (%%r11)	,%%ymm10			\n\t"\
		"vmulpd	    (%%rdi)	,%%ymm3,%%ymm3		\n\t		vmovaps	0x20(%%r11)	,%%ymm11			\n\t"\
		"vmulpd	    (%%rdi)	,%%ymm4,%%ymm4		\n\t		vmovaps	    (%%r11)	,%%ymm12			\n\t"\
		"vmulpd	    (%%rdx)	,%%ymm5,%%ymm5		\n\t		vmovaps	0x20(%%r11)	,%%ymm13			\n\t"\
		"vsubpd	%%ymm3		,%%ymm2,%%ymm2		\n\t		vmulpd	    (%%r14)	,%%ymm10,%%ymm10	\n\t"\
		"vaddpd	%%ymm5		,%%ymm4,%%ymm4		\n\t		vmulpd	    (%%r15)	,%%ymm11,%%ymm11	\n\t"\
		"vmovaps	%%ymm2		,%%ymm3			\n\t		vmulpd	    (%%r15)	,%%ymm12,%%ymm12	\n\t"\
		"vmovaps	%%ymm4		,%%ymm5			\n\t		vmulpd	    (%%r14)	,%%ymm13,%%ymm13	\n\t"\
		"vaddpd	%%ymm0		,%%ymm2,%%ymm2		\n\t		vsubpd	%%ymm11		,%%ymm10,%%ymm10	\n\t"\
		"vsubpd	%%ymm3		,%%ymm0,%%ymm0		\n\t		vaddpd	%%ymm13		,%%ymm12,%%ymm12	\n\t"\
		"vaddpd	%%ymm1		,%%ymm4,%%ymm4		\n\t		vmovaps	%%ymm10		,%%ymm11			\n\t"\
		"vsubpd	%%ymm5		,%%ymm1,%%ymm1		\n\t		vmovaps	%%ymm12		,%%ymm13			\n\t"\
		"vmovaps	%%ymm2		,    (%%rax)	\n\t		vaddpd	%%ymm8 		,%%ymm10,%%ymm10	\n\t"\
		"vmovaps	%%ymm4		,0x20(%%rax)	\n\t		vsubpd	%%ymm11		,%%ymm8 ,%%ymm8 	\n\t"\
		"vmovaps	%%ymm0		,    (%%rbx)	\n\t		vaddpd	%%ymm9 		,%%ymm12,%%ymm12	\n\t"\
		"vmovaps	%%ymm1		,0x20(%%rbx)	\n\t		vsubpd	%%ymm13		,%%ymm9 ,%%ymm9 	\n\t"\
		"													vmovaps	%%ymm10		,    (%%r10)	\n\t"\
		"													vmovaps	%%ymm12		,0x20(%%r10)	\n\t"\
		"													vmovaps	%%ymm8 		,    (%%r11)	\n\t"\
		"													vmovaps	%%ymm9 		,0x20(%%r11)	\n\t"\
	/* combine to get 2 length-4 output subtransforms.
	In this step 2 of the 8-dft, we need address-pairs
		lcol:		rcol:
		i0,2,1,3	i4,6,5,7
		o0,2,1,3	o4,6,5,7
	At this point r[a|b]x have i2,3, r1[0|1] have i6,7, but cleaner to reload add0 and go from there.
	Since we will be loading o-addresses into regs starting with r[a|b]x and r1[0|1], use r[c|d]x and r1[2|3]
	for the I-address pairs here: */\
		"movq	%[in0],%%rcx				\n\t		leaq	(%%rcx,%%r8  ),%%r12	\n\t"/* [lcol,rcol] base-addresses = in0 + [0,1*istride] */\
		"leaq	(%%rcx,%%r8,2),%%rdx		\n\t		leaq	(%%r12,%%r8,2),%%r13	\n\t"/* in0 + [2,3*istride] */\
		"vmovaps	    (%%rcx)	,%%ymm0		\n\t		vmovaps	    (%%r12)	,%%ymm8 	\n\t"\
		"vmovaps	0x20(%%rcx)	,%%ymm1		\n\t		vmovaps	0x20(%%r12)	,%%ymm9 	\n\t"\
		"vmovaps	%%ymm0		,%%ymm4			\n\t		vmovaps	%%ymm8 		,%%ymm12			\n\t"\
		"vmovaps	%%ymm1		,%%ymm5			\n\t		vmovaps	%%ymm9 		,%%ymm13			\n\t"\
		"movq	%[out0]	,%%rsi			\n\t	movq	%[off]	,%%rdi			\n\t"/* Load output base-address into rsi and offset-array pointer into rdi */\
		"movslq		    (%%rdi),%%rax	\n\t	movslq		0x10(%%rdi),%%r10	\n\t"/*        off[0,4] */\
		"leaq	(%%rsi,%%rax,8),%%rax	\n\t	leaq	(%%rsi,%%r10,8),%%r10	\n\t"/* out0 + off[0,4] */\
		"vaddpd	    (%%rdx)	,%%ymm0,%%ymm0		\n\t		vsubpd	0x20(%%r13)	,%%ymm8 ,%%ymm8 	\n\t"\
		"vsubpd	    (%%rdx)	,%%ymm4,%%ymm4		\n\t		vaddpd	0x20(%%r13)	,%%ymm12,%%ymm12	\n\t"\
		"vaddpd	0x20(%%rdx)	,%%ymm1,%%ymm1		\n\t		vaddpd	    (%%r13)	,%%ymm9 ,%%ymm9 	\n\t"\
		"vsubpd	0x20(%%rdx)	,%%ymm5,%%ymm5		\n\t		vsubpd	    (%%r13)	,%%ymm13,%%ymm13	\n\t"\
		"movslq		0x08(%%rdi),%%rbx	\n\t	movslq		0x18(%%rdi),%%r11	\n\t"\
		"leaq	(%%rsi,%%rbx,8),%%rbx	\n\t	leaq	(%%rsi,%%r11,8),%%r11	\n\t"/* out0 + off[2,6] */\
		"vmovaps	%%ymm0		,    (%%rax)	\n\t		vmovaps	%%ymm8 		,    (%%r10)	\n\t"\
		"vmovaps	%%ymm1		,0x20(%%rax)	\n\t		vmovaps	%%ymm9 		,0x20(%%r10)	\n\t"\
		"vmovaps	%%ymm4		,    (%%rbx)	\n\t		vmovaps	%%ymm12		,    (%%r11)	\n\t"\
		"vmovaps	%%ymm5		,0x20(%%rbx)	\n\t		vmovaps	%%ymm13		,0x20(%%r11)	\n\t"\
		"leaq	(%%rcx,%%r8,4),%%rcx		\n\t		leaq	(%%r12,%%r8,4),%%r12	\n\t"/* in0 + [4,5*istride] */\
		"leaq	(%%rdx,%%r8,4),%%rdx		\n\t		leaq	(%%r13,%%r8,4),%%r13	\n\t"/* in0 + [6,7*istride] */\
		"vmovaps	    (%%rcx)	,%%ymm2			\n\t		vmovaps	    (%%r12)	,%%ymm10			\n\t"\
		"vmovaps	0x20(%%rcx)	,%%ymm3			\n\t		vmovaps	0x20(%%r12)	,%%ymm11			\n\t"\
		"movslq		0x04(%%rdi),%%rcx	\n\t	movslq		0x14(%%rdi),%%r12	\n\t"\
		"leaq	(%%rsi,%%rcx,8),%%rcx	\n\t	leaq	(%%rsi,%%r12,8),%%r12	\n\t"/* out0 + off[1,5] */\
		"vmovaps	%%ymm2		,%%ymm6			\n\t		vmovaps	%%ymm10		,%%ymm14			\n\t"\
		"vmovaps	%%ymm3		,%%ymm7			\n\t		vmovaps	%%ymm11		,%%ymm15			\n\t"\
		"vaddpd	    (%%rdx)	,%%ymm2,%%ymm2		\n\t		vsubpd	0x20(%%r13)	,%%ymm10,%%ymm10	\n\t"\
		"vsubpd	    (%%rdx)	,%%ymm6,%%ymm6		\n\t		vaddpd	0x20(%%r13)	,%%ymm14,%%ymm14	\n\t"\
		"vaddpd	0x20(%%rdx)	,%%ymm3,%%ymm3		\n\t		vaddpd	    (%%r13)	,%%ymm11,%%ymm11	\n\t"\
		"vsubpd	0x20(%%rdx)	,%%ymm7,%%ymm7		\n\t		vsubpd	    (%%r13)	,%%ymm15,%%ymm15	\n\t"\
		"movslq		0x0c(%%rdi),%%rdx	\n\t	movslq		0x1c(%%rdi),%%r13	\n\t"\
		"leaq	(%%rsi,%%rdx,8),%%rdx	\n\t	leaq	(%%rsi,%%r13,8),%%r13	\n\t"/* out0 + off[3,7] */\
		"vsubpd	%%ymm2		,%%ymm0,%%ymm0		\n\t		vmovaps	%%ymm12		,    (%%r13)	\n\t"\
		"vsubpd	%%ymm3		,%%ymm1,%%ymm1		\n\t		vmovaps	%%ymm13		,0x20(%%r13)	\n\t"\
	/* Use the cosine term of the [c1,s1] pair, which is the *middle* [4th of 7] of our 7 input pairs, in terms \
	of the input-arg bit-reversal reordering defined in the __X[c,s] --> [c,s] mapping below and happens to \
	always in fact *be* a true cosine term, which is a requirement for our "decr 1 gives isrt2" data-copy scheme: */\
		"movq	%[twid_ptrs],%%r14		\n\t	movq	0x30(%%r14),%%r14	\n\t"\
		"vsubpd	%%ymm7		,%%ymm4,%%ymm4		\n\t	subq	$0x20,%%r14	\n\t"/* isrt2 in [c1]-1 */\
		"vsubpd	%%ymm6		,%%ymm5,%%ymm5		\n\t		vmovaps	%%ymm10		,%%ymm13			\n\t"\
		"vaddpd	    (%%rax)	,%%ymm2,%%ymm2		\n\t		vsubpd	%%ymm11		,%%ymm10,%%ymm10	\n\t"\
		"vaddpd	0x20(%%rax)	,%%ymm3,%%ymm3		\n\t		vaddpd	%%ymm11		,%%ymm13,%%ymm13	\n\t"\
		"vaddpd	    (%%rbx)	,%%ymm7,%%ymm7		\n\t		vmulpd	    (%%r14)	,%%ymm10,%%ymm10	\n\t"\
		"vaddpd	0x20(%%rbx)	,%%ymm6,%%ymm6		\n\t		vmulpd	    (%%r14)	,%%ymm13,%%ymm13	\n\t"\
		"vmovaps	%%ymm2,    (%%rax)	/* [o0].re */\n\t	vmovaps	0x20(%%r13)	,%%ymm11			\n\t"\
		"vmovaps	%%ymm3,0x20(%%rax)	/* [o0].im */\n\t	vmovaps	%%ymm15		,%%ymm12			\n\t"\
		"vmovaps	%%ymm4,    (%%rbx)	/* [o2].re */\n\t	vaddpd	%%ymm14		,%%ymm12,%%ymm12	\n\t"\
		"vmovaps	%%ymm6,0x20(%%rbx)	/* [o2].im */\n\t	vsubpd	%%ymm14		,%%ymm15,%%ymm15	\n\t"\
		"vmovaps	%%ymm0,    (%%rcx)	/* [o1].re */\n\t	vmulpd	    (%%r14)	,%%ymm12,%%ymm12	\n\t"\
		"vmovaps	%%ymm1,0x20(%%rcx)	/* [o1].im */\n\t	vmulpd	    (%%r14)	,%%ymm15,%%ymm15	\n\t"\
		"vmovaps	%%ymm7,    (%%rdx)	/* [o3].re */\n\t	vmovaps		(%%r13)	,%%ymm14			\n\t"\
		"vmovaps	%%ymm5,0x20(%%rdx)	/* [o3].im */\n\t	vsubpd	%%ymm10		,%%ymm8 ,%%ymm8 	\n\t"\
		"													vsubpd	%%ymm13		,%%ymm9 ,%%ymm9 	\n\t"\
		"													vsubpd	%%ymm12		,%%ymm14,%%ymm14	\n\t"\
		"													vsubpd	%%ymm15		,%%ymm11,%%ymm11	\n\t"\
		"													vaddpd	    (%%r10)	,%%ymm10,%%ymm10	\n\t"\
		"													vaddpd	0x20(%%r10)	,%%ymm13,%%ymm13	\n\t"\
		"													vaddpd	    (%%r11)	,%%ymm12,%%ymm12	\n\t"\
		"													vaddpd	0x20(%%r11)	,%%ymm15,%%ymm15	\n\t"\
		"													vmovaps	%%ymm10,    (%%r10)	\n\t"/* [o4].re */\
		"													vmovaps	%%ymm13,0x20(%%r10)	\n\t"/* [o4].im */\
		"													vmovaps	%%ymm14,    (%%r11)	\n\t"/* [o6].re */\
		"													vmovaps	%%ymm11,0x20(%%r11)	\n\t"/* [o6].im */\
		"													vmovaps	%%ymm8 ,    (%%r12)	\n\t"/* [o5].re */\
		"													vmovaps	%%ymm9 ,0x20(%%r12)	\n\t"/* [o5].im */\
		"													vmovaps	%%ymm12,    (%%r13)	\n\t"/* [o7].re */\
		"													vmovaps	%%ymm15,0x20(%%r13)	\n\t"/* [o7].im */\
		:					/* outputs: none */\
		: [in0] "m" (Xin0)	/* All 'm'-inputs from memory addresses here... */\
		 ,[i1] "e" (Xi1)	/* ...except for 'e'-inputs which are literal byte offsets */\
		 ,[out0] "m" (Xout0) /* output-address-octet base pointer */\
		 ,[off] "m" (Xoff)	/* and pointer to uint32 array of 8 double* index offsets */\
		 ,[twid_ptrs] "m" (Xtwid_ptrs)\
		 ,[two] "m" (Xtwo)/* Only used in FMA implementations of this macro */\
		: "cc","memory","rax","rbx","rcx","rdx","rsi","rdi","r8","r10","r11","r12","r13","r14","r15","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15"	/* Clobbered registers */\
	);\
	}

	/* Twiddleless version of SSE2_RADIX8_DIT_TWIDDLE. Inputs enter in memory locations __i0,__i1,__i2,__i3,__i4,__i5,__i6,__i7.
	Outputs go into 16 contiguous 32-byte memory locations starting at __out and are assumed disjoint with the inputs.
	This macro is built on the same code template as SSE2_RADIX8_DIF_TWIDDLE0, but with the I/O-location indices mutually bit-reversed:
	01234567 <--> 04261537, which can be effected via the pairwise swaps 1 <--> 4 and 3 <--> 6.
	*/
	#define	SSE2_RADIX8_DIT_0TWIDDLE(Xi0,Xi1,Xi2,Xi3,Xi4,Xi5,Xi6,Xi7, Xout, Xisrt2)\
	{\
	__asm__ volatile (\
		"/* 1st of 2 radix-4 subtransforms, data in ymm0-7: */\n\t	/* 2nd of 2 radix-4 subtransforms, data in ymm8-15: */\n\t"\
		"movq	%[__i0],%%rax					\n\t		movq	%[__i4],%%r10					\n\t"\
		"movq	%[__i1],%%rbx					\n\t		movq	%[__i5],%%r11					\n\t"\
		"movq	%[__i2],%%rcx					\n\t		movq	%[__i6],%%r12					\n\t"\
		"movq	%[__i3],%%rdx					\n\t		movq	%[__i7],%%r13					\n\t"\
		"										\n\t		/* p1,5 combo: x+-y into ymm8/1, 10/3, resp: */	\n\t"\
		"/* p0,4 combo: x+-y into ymm0/1, 2/3, resp: */\n\tvmovaps	     (%%r11),%%ymm8 			\n\t"\
		"										\n\t		vmovaps	0x020(%%r11),%%ymm9 			\n\t"\
		"vmovaps	     (%%rbx),%%ymm0			\n\t		vmovaps	     (%%r10),%%ymm10			\n\t"\
		"vmovaps	0x020(%%rbx),%%ymm1			\n\t		vmovaps	0x020(%%r10),%%ymm11			\n\t"\
		"vmovaps	     (%%rax),%%ymm2			\n\t		vsubpd	%%ymm8 ,%%ymm10,%%ymm10			\n\t"\
		"vmovaps	0x020(%%rax),%%ymm3			\n\t		vsubpd	%%ymm9 ,%%ymm11,%%ymm11			\n\t"\
		"vsubpd	%%ymm0,%%ymm2,%%ymm2			\n\t		vaddpd	%%ymm8 ,%%ymm8 ,%%ymm8 			\n\t"\
		"vsubpd	%%ymm1,%%ymm3,%%ymm3			\n\t		vaddpd	%%ymm9 ,%%ymm9 ,%%ymm9 			\n\t"\
		"vaddpd	%%ymm0,%%ymm0,%%ymm0			\n\t		vaddpd	%%ymm10,%%ymm8 ,%%ymm8 			\n\t"\
		"vaddpd	%%ymm1,%%ymm1,%%ymm1			\n\t		vaddpd	%%ymm11,%%ymm9 ,%%ymm9 			\n\t"\
		"vaddpd	%%ymm2,%%ymm0,%%ymm0			\n\t		/* p3,7 combo: x+-y into ymm14/7, 12/5, resp: */	\n\t"\
		"vaddpd	%%ymm3,%%ymm1,%%ymm1			\n\t		vmovaps	     (%%r12),%%ymm12			\n\t"\
		"										\n\t		vmovaps	0x020(%%r12),%%ymm13			\n\t"\
		"/* p2,6 combo: x+-y into ymm4/5, 6/7, resp: */\n\t	vmovaps	     (%%r13),%%ymm14			\n\t"\
		"										\n\t		vmovaps	0x020(%%r13),%%ymm15			\n\t"\
		"vmovaps	     (%%rdx),%%ymm4			\n\t		vsubpd	%%ymm14,%%ymm12,%%ymm12			\n\t"\
		"vmovaps	0x020(%%rdx),%%ymm5			\n\t		vsubpd	%%ymm15,%%ymm13,%%ymm13			\n\t"\
		"vmovaps	     (%%rcx),%%ymm6			\n\t		vaddpd	%%ymm14,%%ymm14,%%ymm14			\n\t"\
		"vmovaps	0x020(%%rcx),%%ymm7			\n\t		vaddpd	%%ymm15,%%ymm15,%%ymm15			\n\t"\
		"vsubpd	%%ymm4,%%ymm6,%%ymm6			\n\t		vaddpd	%%ymm12,%%ymm14,%%ymm14			\n\t"\
		"vsubpd	%%ymm5,%%ymm7,%%ymm7			\n\t		vaddpd	%%ymm13,%%ymm15,%%ymm15			\n\t"\
		"vaddpd	%%ymm4,%%ymm4,%%ymm4			\n\t		/* Finish radix-4 butterfly, tmp-store 1st of 4 outputs to free up 2 registers: */\n\t"\
		"vaddpd	%%ymm5,%%ymm5,%%ymm5			\n\t		vsubpd	%%ymm14,%%ymm8 ,%%ymm8 			\n\t"\
		"vaddpd	%%ymm6,%%ymm4,%%ymm4			\n\t		vsubpd	%%ymm15,%%ymm9 ,%%ymm9 			\n\t"\
		"vaddpd	%%ymm7,%%ymm5,%%ymm5			\n\t		vsubpd	%%ymm13,%%ymm10,%%ymm10			\n\t"\
		"										\n\t		vsubpd	%%ymm12,%%ymm11,%%ymm11			\n\t"\
		"vsubpd	%%ymm4,%%ymm0,%%ymm0			\n\t		vaddpd	%%ymm14,%%ymm14,%%ymm14			\n\t"\
		"vsubpd	%%ymm7,%%ymm2,%%ymm2			\n\t		vaddpd	%%ymm13,%%ymm13,%%ymm13			\n\t"\
		"vsubpd	%%ymm5,%%ymm1,%%ymm1			\n\t		vaddpd	%%ymm15,%%ymm15,%%ymm15			\n\t"\
		"vsubpd	%%ymm6,%%ymm3,%%ymm3			\n\t		vaddpd	%%ymm12,%%ymm12,%%ymm12			\n\t"\
		"													vaddpd	%%ymm8 ,%%ymm14,%%ymm14			\n\t"\
		"													vaddpd	%%ymm10,%%ymm13,%%ymm13			\n\t"\
		"													vaddpd	%%ymm9 ,%%ymm15,%%ymm15			\n\t"\
		"													vaddpd	%%ymm11,%%ymm12,%%ymm12			\n\t"\
		"													movq	%[__isrt2],%%rsi	/* isrt2 */	\n\t"\
		"vaddpd	%%ymm4,%%ymm4,%%ymm4			\n\t		vmovaps	%%ymm14,     (%%rax)	/* spill*/	\n\t"\
		"vaddpd	%%ymm7,%%ymm7,%%ymm7			\n\t		vmovaps	%%ymm15,0x020(%%rax)	/* spill*/	\n\t"\
		"vaddpd	%%ymm5,%%ymm5,%%ymm5			\n\t		vmovaps	%%ymm10,%%ymm14					\n\t"\
		"vaddpd	%%ymm6,%%ymm6,%%ymm6			\n\t		vmovaps	%%ymm13,%%ymm15					\n\t"\
		"vaddpd	%%ymm0,%%ymm4,%%ymm4			\n\t		vsubpd	%%ymm12,%%ymm10,%%ymm10			\n\t"\
		"vaddpd	%%ymm2,%%ymm7,%%ymm7			\n\t		vsubpd	%%ymm11,%%ymm13,%%ymm13			\n\t"\
		"vaddpd	%%ymm1,%%ymm5,%%ymm5			\n\t		vaddpd	%%ymm14,%%ymm12,%%ymm12			\n\t"\
		"vaddpd	%%ymm3,%%ymm6,%%ymm6			\n\t		vaddpd	%%ymm15,%%ymm11,%%ymm11			\n\t"\
		"													vmovaps	(%%rsi),%%ymm14		/* isrt2 */	\n\t"\
		"													vmulpd	%%ymm14,%%ymm10,%%ymm10			\n\t"\
		"													vmulpd	%%ymm14,%%ymm13,%%ymm13			\n\t"\
		"													vmulpd	%%ymm14,%%ymm12,%%ymm12			\n\t"\
		"													vmulpd	%%ymm14,%%ymm11,%%ymm11			\n\t"\
		"/* Combine radix-4 subtransforms and write outputs: */\n\t"\
		"\n\t"\
		"vmovaps      (%%rax),%%ymm14/* reload spill */\n\t	vsubpd   %%ymm10,%%ymm2	,%%ymm2			\n\t"\
		"vmovaps 0x020(%%rax),%%ymm15/* reload spill */\n\t	vsubpd   %%ymm12,%%ymm6	,%%ymm6			\n\t"\
		"													vaddpd   %%ymm10,%%ymm10,%%ymm10		\n\t"\
		"movq	%[__out],%%rax					\n\t		vaddpd   %%ymm12,%%ymm12,%%ymm12		\n\t"\
		"										\n\t		vaddpd   %%ymm2 ,%%ymm10,%%ymm10		\n\t"\
		"vsubpd   %%ymm11,%%ymm7 ,%%ymm7 		\n\t		vaddpd   %%ymm6 ,%%ymm12,%%ymm12		\n\t"\
		"vsubpd   %%ymm13,%%ymm3 ,%%ymm3 		\n\t"\
		"vaddpd   %%ymm11,%%ymm11,%%ymm11		\n\t		vmovaps	%%ymm2 ,0x140(%%rax)	/* o5r */	\n\t"\
		"vaddpd   %%ymm13,%%ymm13,%%ymm13		\n\t		vmovaps	%%ymm6 ,0x160(%%rax)	/* o5i */	\n\t"\
		"vaddpd   %%ymm7 ,%%ymm11,%%ymm11		\n\t		vmovaps	%%ymm10,0x040(%%rax)	/* o1r */	\n\t"\
		"vaddpd   %%ymm3 ,%%ymm13,%%ymm13		\n\t		vmovaps	%%ymm12,0x060(%%rax)	/* o1i */	\n\t"\
		"										\n\t"\
		"vmovaps	%%ymm7 ,0x0c0(%%rax)	/* o3r */\n\t"\
		"vmovaps	%%ymm3 ,0x1e0(%%rax)	/* o7i */\n\t"\
		"vmovaps	%%ymm11,0x1c0(%%rax)	/* o7r */\n\t"\
		"vmovaps	%%ymm13,0x0e0(%%rax)	/* o3i */\n\t"\
		"										\n\t"\
		"vsubpd	%%ymm14,%%ymm4 ,%%ymm4 			\n\t"\
		"vsubpd	%%ymm15,%%ymm5 ,%%ymm5 			\n\t"\
		"vsubpd	%%ymm9 ,%%ymm0 ,%%ymm0 			\n\t"\
		"vsubpd	%%ymm8 ,%%ymm1 ,%%ymm1 			\n\t"\
		"vaddpd	%%ymm14,%%ymm14,%%ymm14			\n\t		vmovaps	%%ymm4 ,0x100(%%rax)	/* o4r */	\n\t"\
		"vaddpd	%%ymm15,%%ymm15,%%ymm15			\n\t		vmovaps	%%ymm5 ,0x120(%%rax)	/* o4i */	\n\t"\
		"vaddpd	%%ymm9 ,%%ymm9 ,%%ymm9 			\n\t		vmovaps	%%ymm0 ,0x080(%%rax)	/* o2r */	\n\t"\
		"vaddpd	%%ymm8 ,%%ymm8 ,%%ymm8 			\n\t		vmovaps	%%ymm1 ,0x1a0(%%rax)	/* o6i */	\n\t"\
		"vaddpd	%%ymm4 ,%%ymm14,%%ymm14			\n\t"\
		"vaddpd	%%ymm5 ,%%ymm15,%%ymm15			\n\t"\
		"vaddpd	%%ymm0 ,%%ymm9 ,%%ymm9 			\n\t"\
		"vaddpd	%%ymm1 ,%%ymm8 ,%%ymm8 			\n\t"\
		"										\n\t"\
		"vmovaps	%%ymm14,     (%%rax)	/* o0r */\n\t"\
		"vmovaps	%%ymm15,0x020(%%rax)	/* o0r */\n\t"\
		"vmovaps	%%ymm9 ,0x180(%%rax)	/* o6r */\n\t"\
		"vmovaps	%%ymm8 ,0x0a0(%%rax)	/* o2i */\n\t"\
		:					/* outputs: none */\
		: [__i0] "m" (Xi0)	/* All iputs from memory addresses here */\
		 ,[__i1] "m" (Xi1)\
		 ,[__i2] "m" (Xi2)\
		 ,[__i3] "m" (Xi3)\
		 ,[__i4] "m" (Xi4)\
		 ,[__i5] "m" (Xi5)\
		 ,[__i6] "m" (Xi6)\
		 ,[__i7] "m" (Xi7)\
		 ,[__out] "m" (Xout)\
		 ,[__isrt2] "m" (Xisrt2)\
		: "cc","memory","rax","rbx","rcx","rdx","rsi","r10","r11","r12","r13","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15"		/* Clobbered registers */\
	);\
	}

	// Same as SSE2_RADIX8_DIT_0TWIDDLE but with user-specifiable [i.e. not nec. contiguous] output addresses:
	/* AVX radix-8 DIT, twiddleless, with individually-addressed outputs: same arithmetic
	as SSE2_RADIX8_DIT_0TWIDDLE, but the 8 complex results are written to the 8 separately
	specified (not necessarily contiguous) 32-byte-aligned addresses __o0-__o7 instead of a
	contiguous block. As in that macro, two vectors are spilled to and reloaded from the
	__i0 memory slots, so the data at __i0 are used as scratch and the outputs must not
	alias the inputs. __isrt2 points to a 4-double vector holding 1/sqrt(2). */
	#define	SSE2_RADIX8_DIT_0TWIDDLE_OOP(Xi0,Xi1,Xi2,Xi3,Xi4,Xi5,Xi6,Xi7, Xo0,Xo1,Xo2,Xo3,Xo4,Xo5,Xo6,Xo7, Xisrt2)\
	{\
	__asm__ volatile (\
		"/* 1st of 2 radix-4 subtransforms, data in ymm0-7: */\n\t	/* 2nd of 2 radix-4 subtransforms, data in ymm8-15: */\n\t"\
		"movq	%[__i0],%%rax					\n\t		movq	%[__i4],%%r10					\n\t"\
		"movq	%[__i1],%%rbx					\n\t		movq	%[__i5],%%r11					\n\t"\
		"movq	%[__i2],%%rcx					\n\t		movq	%[__i6],%%r12					\n\t"\
		"movq	%[__i3],%%rdx					\n\t		movq	%[__i7],%%r13					\n\t"\
		"										\n\t		/* p1,5 combo: x+-y into ymm8/1, 10/3, resp: */	\n\t"\
		"/* p0,4 combo: x+-y into ymm0/1, 2/3, resp: */\n\tvmovaps	     (%%r11),%%ymm8 			\n\t"\
		"										\n\t		vmovaps	0x020(%%r11),%%ymm9 			\n\t"\
		"vmovaps	     (%%rbx),%%ymm0			\n\t		vmovaps	     (%%r10),%%ymm10			\n\t"\
		"vmovaps	0x020(%%rbx),%%ymm1			\n\t		vmovaps	0x020(%%r10),%%ymm11			\n\t"\
		"vmovaps	     (%%rax),%%ymm2			\n\t		vsubpd	%%ymm8 ,%%ymm10,%%ymm10			\n\t"\
		"vmovaps	0x020(%%rax),%%ymm3			\n\t		vsubpd	%%ymm9 ,%%ymm11,%%ymm11			\n\t"\
		"vsubpd	%%ymm0,%%ymm2,%%ymm2			\n\t		vaddpd	%%ymm8 ,%%ymm8 ,%%ymm8 			\n\t"\
		"vsubpd	%%ymm1,%%ymm3,%%ymm3			\n\t		vaddpd	%%ymm9 ,%%ymm9 ,%%ymm9 			\n\t"\
		"vaddpd	%%ymm0,%%ymm0,%%ymm0			\n\t		vaddpd	%%ymm10,%%ymm8 ,%%ymm8 			\n\t"\
		"vaddpd	%%ymm1,%%ymm1,%%ymm1			\n\t		vaddpd	%%ymm11,%%ymm9 ,%%ymm9 			\n\t"\
		"vaddpd	%%ymm2,%%ymm0,%%ymm0			\n\t		/* p3,7 combo: x+-y into ymm14/7, 12/5, resp: */	\n\t"\
		"vaddpd	%%ymm3,%%ymm1,%%ymm1			\n\t		vmovaps	     (%%r12),%%ymm12			\n\t"\
		"										\n\t		vmovaps	0x020(%%r12),%%ymm13			\n\t"\
		"/* p2,6 combo: x+-y into ymm4/5, 6/7, resp: */\n\t	vmovaps	     (%%r13),%%ymm14			\n\t"\
		"										\n\t		vmovaps	0x020(%%r13),%%ymm15			\n\t"\
		"vmovaps	     (%%rdx),%%ymm4			\n\t		vsubpd	%%ymm14,%%ymm12,%%ymm12			\n\t"\
		"vmovaps	0x020(%%rdx),%%ymm5			\n\t		vsubpd	%%ymm15,%%ymm13,%%ymm13			\n\t"\
		"vmovaps	     (%%rcx),%%ymm6			\n\t		vaddpd	%%ymm14,%%ymm14,%%ymm14			\n\t"\
		"vmovaps	0x020(%%rcx),%%ymm7			\n\t		vaddpd	%%ymm15,%%ymm15,%%ymm15			\n\t"\
		"vsubpd	%%ymm4,%%ymm6,%%ymm6			\n\t		vaddpd	%%ymm12,%%ymm14,%%ymm14			\n\t"\
		"vsubpd	%%ymm5,%%ymm7,%%ymm7			\n\t		vaddpd	%%ymm13,%%ymm15,%%ymm15			\n\t"\
		"vaddpd	%%ymm4,%%ymm4,%%ymm4			\n\t		/* Finish radix-4 butterfly, tmp-store 1st of 4 outputs to free up 2 registers: */\n\t"\
		"vaddpd	%%ymm5,%%ymm5,%%ymm5			\n\t		vsubpd	%%ymm14,%%ymm8 ,%%ymm8 			\n\t"\
		"vaddpd	%%ymm6,%%ymm4,%%ymm4			\n\t		vsubpd	%%ymm15,%%ymm9 ,%%ymm9 			\n\t"\
		"vaddpd	%%ymm7,%%ymm5,%%ymm5			\n\t		vsubpd	%%ymm13,%%ymm10,%%ymm10			\n\t"\
		"										\n\t		vsubpd	%%ymm12,%%ymm11,%%ymm11			\n\t"\
		"vsubpd	%%ymm4,%%ymm0,%%ymm0			\n\t		vaddpd	%%ymm14,%%ymm14,%%ymm14			\n\t"\
		"vsubpd	%%ymm7,%%ymm2,%%ymm2			\n\t		vaddpd	%%ymm13,%%ymm13,%%ymm13			\n\t"\
		"vsubpd	%%ymm5,%%ymm1,%%ymm1			\n\t		vaddpd	%%ymm15,%%ymm15,%%ymm15			\n\t"\
		"vsubpd	%%ymm6,%%ymm3,%%ymm3			\n\t		vaddpd	%%ymm12,%%ymm12,%%ymm12			\n\t"\
		"													vaddpd	%%ymm8 ,%%ymm14,%%ymm14			\n\t"\
		"													vaddpd	%%ymm10,%%ymm13,%%ymm13			\n\t"\
		"													vaddpd	%%ymm9 ,%%ymm15,%%ymm15			\n\t"\
		"													vaddpd	%%ymm11,%%ymm12,%%ymm12			\n\t"\
		"													movq	%[__isrt2],%%rsi	/* isrt2 */	\n\t"\
		"vaddpd	%%ymm4,%%ymm4,%%ymm4			\n\t		vmovaps	%%ymm14,     (%%rax)	/* spill*/	\n\t"\
		"vaddpd	%%ymm7,%%ymm7,%%ymm7			\n\t		vmovaps	%%ymm15,0x020(%%rax)	/* spill*/	\n\t"\
		"vaddpd	%%ymm5,%%ymm5,%%ymm5			\n\t		vmovaps	%%ymm10,%%ymm14					\n\t"\
		"vaddpd	%%ymm6,%%ymm6,%%ymm6			\n\t		vmovaps	%%ymm13,%%ymm15					\n\t"\
		"vaddpd	%%ymm0,%%ymm4,%%ymm4			\n\t		vsubpd	%%ymm12,%%ymm10,%%ymm10			\n\t"\
		"vaddpd	%%ymm2,%%ymm7,%%ymm7			\n\t		vsubpd	%%ymm11,%%ymm13,%%ymm13			\n\t"\
		"vaddpd	%%ymm1,%%ymm5,%%ymm5			\n\t		vaddpd	%%ymm14,%%ymm12,%%ymm12			\n\t"\
		"vaddpd	%%ymm3,%%ymm6,%%ymm6			\n\t		vaddpd	%%ymm15,%%ymm11,%%ymm11			\n\t"\
		"													vmovaps	(%%rsi),%%ymm14		/* isrt2 */	\n\t"\
		"													vmulpd	%%ymm14,%%ymm10,%%ymm10			\n\t"\
		"													vmulpd	%%ymm14,%%ymm13,%%ymm13			\n\t"\
		"													vmulpd	%%ymm14,%%ymm12,%%ymm12			\n\t"\
		"													vmulpd	%%ymm14,%%ymm11,%%ymm11			\n\t"\
		"/* Combine radix-4 subtransforms and write outputs: */\n\t"\
		"\n\t"\
		"vmovaps      (%%rax),%%ymm14/* reload spill */\n\t	vsubpd   %%ymm10,%%ymm2	,%%ymm2			\n\t"\
		"vmovaps 0x020(%%rax),%%ymm15/* reload spill */\n\t	vsubpd   %%ymm12,%%ymm6	,%%ymm6			\n\t"\
		"movq	%[__o1],%%rax					\n\t		movq	%[__o5],%%rcx					\n\t"\
		"													vaddpd   %%ymm10,%%ymm10,%%ymm10		\n\t"\
		"													vaddpd   %%ymm12,%%ymm12,%%ymm12		\n\t"\
		"										\n\t		vaddpd   %%ymm2 ,%%ymm10,%%ymm10		\n\t"\
		"vsubpd   %%ymm11,%%ymm7 ,%%ymm7 		\n\t		vaddpd   %%ymm6 ,%%ymm12,%%ymm12		\n\t"\
		"vsubpd   %%ymm13,%%ymm3 ,%%ymm3 		\n\t"\
		"movq	%[__o3],%%rbx					\n\t		movq	%[__o7],%%rdx					\n\t"\
		"vaddpd   %%ymm11,%%ymm11,%%ymm11		\n\t		vmovaps	%%ymm2 ,    (%%rcx)	/* o5r */	\n\t"\
		"vaddpd   %%ymm13,%%ymm13,%%ymm13		\n\t		vmovaps	%%ymm6 ,0x20(%%rcx)	/* o5i */	\n\t"\
		"vaddpd   %%ymm7 ,%%ymm11,%%ymm11		\n\t		vmovaps	%%ymm10,    (%%rax)	/* o1r */	\n\t"\
		"vaddpd   %%ymm3 ,%%ymm13,%%ymm13		\n\t		vmovaps	%%ymm12,0x20(%%rax)	/* o1i */	\n\t"\
		"movq	%[__o0],%%rax					\n\t		movq	%[__o4],%%rcx					\n\t"\
		"										\n\t"\
		"vmovaps	%%ymm7 ,    (%%rbx)	/* o3r */	\n\t"\
		"vmovaps	%%ymm3 ,0x20(%%rdx)	/* o7i */	\n\t"\
		"vmovaps	%%ymm11,    (%%rdx)	/* o7r */	\n\t"\
		"vmovaps	%%ymm13,0x20(%%rbx)	/* o3i */	\n\t"\
		"										\n\t"\
		"movq	%[__o2],%%rbx					\n\t		movq	%[__o6],%%rdx					\n\t"\
		"vsubpd	%%ymm14,%%ymm4 ,%%ymm4 			\n\t"\
		"vsubpd	%%ymm15,%%ymm5 ,%%ymm5 			\n\t"\
		"vsubpd	%%ymm9 ,%%ymm0 ,%%ymm0 			\n\t"\
		"vsubpd	%%ymm8 ,%%ymm1 ,%%ymm1 			\n\t"\
		"vaddpd	%%ymm14,%%ymm14,%%ymm14			\n\t		vmovaps	%%ymm4 ,    (%%rcx)	/* o4r */	\n\t"\
		"vaddpd	%%ymm15,%%ymm15,%%ymm15			\n\t		vmovaps	%%ymm5 ,0x20(%%rcx)	/* o4i */	\n\t"\
		"vaddpd	%%ymm9 ,%%ymm9 ,%%ymm9 			\n\t		vmovaps	%%ymm0 ,    (%%rbx)	/* o2r */	\n\t"\
		"vaddpd	%%ymm8 ,%%ymm8 ,%%ymm8 			\n\t		vmovaps	%%ymm1 ,0x20(%%rdx)	/* o6i */	\n\t"\
		"vaddpd	%%ymm4 ,%%ymm14,%%ymm14			\n\t"\
		"vaddpd	%%ymm5 ,%%ymm15,%%ymm15			\n\t"\
		"vaddpd	%%ymm0 ,%%ymm9 ,%%ymm9 			\n\t"\
		"vaddpd	%%ymm1 ,%%ymm8 ,%%ymm8 			\n\t"\
		"										\n\t"\
		"vmovaps	%%ymm14,    (%%rax)	/* o0r */	\n\t"\
		"vmovaps	%%ymm15,0x20(%%rax)	/* o0i */	\n\t"\
		"vmovaps	%%ymm9 ,    (%%rdx)	/* o6r */	\n\t"\
		"vmovaps	%%ymm8 ,0x20(%%rbx)	/* o2i */	\n\t"\
		:					/* outputs: none */\
		: [__i0] "m" (Xi0)	/* All iputs from memory addresses here */\
		 ,[__i1] "m" (Xi1)\
		 ,[__i2] "m" (Xi2)\
		 ,[__i3] "m" (Xi3)\
		 ,[__i4] "m" (Xi4)\
		 ,[__i5] "m" (Xi5)\
		 ,[__i6] "m" (Xi6)\
		 ,[__i7] "m" (Xi7)\
		 ,[__o0] "m" (Xo0)\
		 ,[__o1] "m" (Xo1)\
		 ,[__o2] "m" (Xo2)\
		 ,[__o3] "m" (Xo3)\
		 ,[__o4] "m" (Xo4)\
		 ,[__o5] "m" (Xo5)\
		 ,[__o6] "m" (Xo6)\
		 ,[__o7] "m" (Xo7)\
		 ,[__isrt2] "m" (Xisrt2)\
		: "cc","memory","rax","rbx","rcx","rdx","rsi","r10","r11","r12","r13","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15"		/* Clobbered registers */\
	);\
	}

	// AVX Opcount: 84 vec MEM [30 implicit], 66 ADD/SUB, 50 MUL.
	/* AVX radix-8 DIT with twiddles, out-of-place. Inputs are the 8 complex vector data
	at in0 + j*i1, j = 0-7, where i1 is a literal byte offset ('e'-constraint); outputs
	are written to out0 + j*o_off. twid_ptrs is an array of 14 pointers to the vector
	twiddle components, in-order (c,s)[1,2,3,4,5,6,7] (see comment below); the code also
	expects a vector ISRT2 = 1/sqrt(2) to sit 0x20 bytes below the c4 twiddle datum
	(loaded via -0x20(%%rdi) after rdi := c4), and [two] to point to a vector of 2.0.
	NOTE: intermediate results are written back into the input slots (and two vectors are
	spilled there late in the macro), so the input data are destroyed. */
	#define SSE2_RADIX8_DIT_TWIDDLE_OOP(Xin0,Xi1, Xout0,Xo_off, Xtwid_ptrs, Xtwo)\
	{\
	__asm__ volatile (\
	/* i1 is base byte-offset, no need to lshift it prior to add: */\
		"xorq	%%r8,%%r8	\n\t	leaq	%c[i1](%%r8),%%r8	\n\t"/* movq|movslq of literal %c[i1] both segfaulted, workaround via LEA */\
		/* The twid_ptrs[] array holds ptrs to 14 complex twiddles in-order: (c,s)[1,2,3,4,5,6,7]: */\
		"movq	%[twid_ptrs],%%r14	\n\t"\
	/* Block 0/1 has just one twiddle-CMUL: 												/* Blocks 2/3 use separate register subset, can be done overlapped with 0/1: */\
	"movq		%[in0],%%rax		\n\t"\
		"leaq	(%%rax,%%r8  ),%%rbx	\n\t"\
		"leaq	(%%rax,%%r8,2),%%rcx	\n\t"\
		"leaq	(%%rcx,%%r8  ),%%rdx	\n\t												movq	0x10(%%r14),%%r10				\n\t"/* c2 */\
	"movq	    (%%r14),%%rdi	\n\t/* [rdi,rsi] -> [c,s] components of each sincos pair, */movq	0x20(%%r14),%%r12				\n\t"/* c3 */\
	"movq	0x08(%%r14),%%rsi	\n\t/* (not truly a pair here in terms of rel-addresses). */movq	0x18(%%r14),%%r11				\n\t"/* s2 */\
	"vmovaps	    (%%rbx),%%ymm4 		\n\t	vmovaps		0x20(%%rbx),%%ymm5 		\n\t	movq	0x28(%%r14),%%r13				\n\t"/* s3 */\
	"vmovaps	    (%%rax),%%ymm0 		\n\t	vmovaps		0x20(%%rax),%%ymm1 		\n\t	vmovaps		(%%rcx),%%ymm8 			\n\t	vmovaps		0x20(%%rcx),%%ymm9 		\n\t"\
	"vmovaps	%%ymm5 ,%%ymm6 			\n\t	vmovaps		%%ymm4 ,%%ymm7 			\n\t	vmovaps	%%ymm9 ,%%ymm10				\n\t	vmovaps		%%ymm8 ,%%ymm11			\n\t"\
	"vmulpd		(%%rdi),%%ymm4 ,%%ymm4 	\n\t	vmulpd		(%%rdi),%%ymm5 ,%%ymm5 	\n\t	vmovaps		(%%rdx),%%ymm12			\n\t	vmovaps		0x20(%%rdx),%%ymm13		\n\t"\
	"vmulpd		(%%rsi),%%ymm6 ,%%ymm6 	\n\t	vmulpd		(%%rsi),%%ymm7 ,%%ymm7 	\n\t	vmovaps	%%ymm13,%%ymm14				\n\t	vmovaps		%%ymm12,%%ymm15			\n\t"\
	"vaddpd		%%ymm6 ,%%ymm4 ,%%ymm4 	\n\t	vsubpd		%%ymm7 ,%%ymm5 ,%%ymm5 	\n\t	vmulpd		(%%r10),%%ymm8 ,%%ymm8 	\n\t	vmulpd		(%%r10),%%ymm9 ,%%ymm9 	\n\t"\
	"vmovaps	%%ymm0 ,%%ymm2 			\n\t	vmovaps		%%ymm1 ,%%ymm3 			\n\t	vmulpd		(%%r12),%%ymm12,%%ymm12	\n\t	vmulpd		(%%r12),%%ymm13,%%ymm13	\n\t"\
	"vaddpd		%%ymm4 ,%%ymm0 ,%%ymm0 	\n\t	vaddpd		%%ymm5 ,%%ymm1 ,%%ymm1 	\n\t	vmulpd		(%%r11),%%ymm10,%%ymm10	\n\t	vmulpd		(%%r11),%%ymm11,%%ymm11	\n\t"\
	"vsubpd		%%ymm4 ,%%ymm2 ,%%ymm2 	\n\t	vsubpd		%%ymm5 ,%%ymm3 ,%%ymm3 	\n\t	vmulpd		(%%r13),%%ymm14,%%ymm14	\n\t	vmulpd		(%%r13),%%ymm15,%%ymm15	\n\t"\
	"vmovaps	%%ymm0 ,    (%%rax)		\n\t	vmovaps		%%ymm1 ,0x20(%%rax)		\n\t	vaddpd		%%ymm10,%%ymm8 ,%%ymm8 	\n\t	vsubpd		%%ymm11,%%ymm9 ,%%ymm9 	\n\t"\
	"vmovaps	%%ymm2 ,    (%%rbx)		\n\t	vmovaps		%%ymm3 ,0x20(%%rbx)		\n\t	vaddpd		%%ymm14,%%ymm12,%%ymm12	\n\t	vsubpd		%%ymm15,%%ymm13,%%ymm13	\n\t"\
																							/* Now do radix-2 butterfly: */\
																						"	vmovaps		%%ymm8 ,%%ymm10			\n\t	vmovaps		%%ymm9 ,%%ymm11			\n\t"\
																						"	vaddpd		%%ymm12,%%ymm8 ,%%ymm8 	\n\t	vaddpd		%%ymm13,%%ymm9 ,%%ymm9 	\n\t"\
																						"	vsubpd		%%ymm12,%%ymm10,%%ymm10	\n\t	vsubpd		%%ymm13,%%ymm11,%%ymm11	\n\t"\
																						"	vmovaps		%%ymm8 ,    (%%rcx)		\n\t	vmovaps		%%ymm9 ,0x20(%%rcx)		\n\t"\
																						"	vmovaps		%%ymm10,    (%%rdx)		\n\t	vmovaps		%%ymm11,0x20(%%rdx)		\n\t"\
	/* Blocks 4/5: */																		/* Blocks 6/7 use separate register subset, can be done overlapped with 4/5: */\
	"shlq	$2,%%r8			\n\t"/* From here on only need offset i4 = 4*i1 */\
	"addq	%%r8,%%rax		\n\t"/* Remaining 4 I-address-calculations are in-place += i4, so use ADD, faster than LEA */\
	"addq	%%r8,%%rbx		\n\t"\
	"addq	%%r8,%%rcx		\n\t"\
	"addq	%%r8,%%rdx		\n\t"\
	"vmovaps		(%%rax),%%ymm0 		\n\t	vmovaps		0x20(%%rax),%%ymm1 		\n\t	vmovaps		(%%rcx),%%ymm8 			\n\t	vmovaps		0x20(%%rcx),%%ymm9 		\n\t"\
	"vmovaps	%%ymm1 ,%%ymm2 			\n\t	vmovaps		%%ymm0 ,%%ymm3 			\n\t	vmovaps		%%ymm9 ,%%ymm10			\n\t	vmovaps		%%ymm8 ,%%ymm11			\n\t"\
	"vmovaps		(%%rbx),%%ymm4 		\n\t	vmovaps		0x20(%%rbx),%%ymm5 		\n\t	vmovaps		(%%rdx),%%ymm12			\n\t	vmovaps		0x20(%%rdx),%%ymm13		\n\t"\
	"vmovaps	%%ymm5 ,%%ymm6 			\n\t	vmovaps		%%ymm4 ,%%ymm7 			\n\t	vmovaps		%%ymm13,%%ymm14			\n\t	vmovaps		%%ymm12,%%ymm15			\n\t"\
	"subq		%%r8,%%rax			\n\t"\
	"subq		%%r8,%%rbx			\n\t"\
	"subq		%%r8,%%rcx			\n\t"\
	"subq		%%r8,%%rdx			\n\t"\
	"movq	0x30(%%r14),%%rdi	/* c4 */\n\t												movq	0x50(%%r14),%%r10		\n\t"/* c6 */\
	"movq	0x40(%%r14),%%r8 	/* c5 */\n\t												movq	0x60(%%r14),%%r12		\n\t"/* c7 */\
	"movq	0x38(%%r14),%%rsi	/* s4 */\n\t												movq	0x58(%%r14),%%r11		\n\t"/* s6 */\
	"movq	0x48(%%r14),%%r9 	/* s5 */\n\t												movq	0x68(%%r14),%%r13		\n\t"/* s7 */\
	"vmulpd		(%%rdi),%%ymm0 ,%%ymm0 	\n\t	vmulpd		(%%rdi),%%ymm1 ,%%ymm1 	\n\t	vmulpd		(%%r10),%%ymm8 ,%%ymm8 	\n\t	vmulpd		(%%r10),%%ymm9 ,%%ymm9 	\n\t"\
	"vmulpd		(%%r8 ),%%ymm4 ,%%ymm4 	\n\t	vmulpd		(%%r8 ),%%ymm5 ,%%ymm5 	\n\t	vmulpd		(%%r12),%%ymm12,%%ymm12	\n\t	vmulpd		(%%r12),%%ymm13,%%ymm13	\n\t"\
	"vmulpd		(%%r9 ),%%ymm6 ,%%ymm6 	\n\t	vmulpd		(%%r9 ),%%ymm7 ,%%ymm7 	\n\t	vmulpd		(%%r13),%%ymm14,%%ymm14	\n\t	vmulpd		(%%r13),%%ymm15,%%ymm15	\n\t"\
	"vmulpd		(%%rsi),%%ymm2 ,%%ymm2 	\n\t	vmulpd		(%%rsi),%%ymm3 ,%%ymm3 	\n\t	vmulpd		(%%r11),%%ymm10,%%ymm10	\n\t	vmulpd		(%%r11),%%ymm11,%%ymm11	\n\t"\
	"vaddpd		%%ymm2 ,%%ymm0 ,%%ymm0 	\n\t	vsubpd		%%ymm3 ,%%ymm1 ,%%ymm1 	\n\t	vaddpd		%%ymm10,%%ymm8 ,%%ymm8 	\n\t	vsubpd		%%ymm11,%%ymm9 ,%%ymm9 	\n\t"\
	"vaddpd		%%ymm6 ,%%ymm4 ,%%ymm4 	\n\t	vsubpd		%%ymm7 ,%%ymm5 ,%%ymm5 	\n\t	vaddpd		%%ymm14,%%ymm12,%%ymm12	\n\t	vsubpd		%%ymm15,%%ymm13,%%ymm13	\n\t"\
	/* Now do radix-2 butterfly: */\
	"vmovaps	%%ymm0 ,%%ymm2 			\n\t	vmovaps		%%ymm1 ,%%ymm3 			\n\t	vmovaps		%%ymm8 ,%%ymm10			\n\t	vmovaps		%%ymm9 ,%%ymm11			\n\t"\
	"vaddpd		%%ymm4 ,%%ymm0 ,%%ymm0 	\n\t	vaddpd		%%ymm5 ,%%ymm1 ,%%ymm1 	\n\t	vaddpd		%%ymm12,%%ymm8 ,%%ymm8 	\n\t	vaddpd		%%ymm13,%%ymm9 ,%%ymm9 	\n\t"\
	"vsubpd		%%ymm4 ,%%ymm2 ,%%ymm2 	\n\t	vsubpd		%%ymm5 ,%%ymm3 ,%%ymm3 	\n\t	vsubpd		%%ymm12,%%ymm10,%%ymm10	\n\t	vsubpd		%%ymm13,%%ymm11,%%ymm11	\n\t"\
	/* Reload Block 0-3 outputs into r4-7,c-f, combine to get the 2 length-4 subtransform... */\
	"vmovaps		(%%rax),%%ymm4 		\n\t	vmovaps		0x20(%%rax),%%ymm5 		\n\t"\
	"vmovaps		(%%rbx),%%ymm6 		\n\t	vmovaps		0x20(%%rbx),%%ymm7 		\n\t"\
	"vmovaps		(%%rcx),%%ymm12		\n\t	vmovaps		0x20(%%rcx),%%ymm13		\n\t"\
	"vmovaps		(%%rdx),%%ymm14		\n\t	vmovaps		0x20(%%rdx),%%ymm15		\n\t"\
	"movq		%[out0],%%rax			\n\t	movq		%[o_off],%%r8		\n\t"/* out0, off1 */\
	"movq		%[two],%%rsi			\n\t	leaq		(%%r8,%%r8),%%r9	\n\t"/* (vec_dbl)2.0, off2 */\
		"										leaq		(%%r9,%%r9),%%r10	\n\t"/* off4 */\
	"vsubpd		%%ymm12,%%ymm4 ,%%ymm4 	\n\t	vsubpd		%%ymm13,%%ymm5 ,%%ymm5 	\n\t"\
	"vsubpd		%%ymm15,%%ymm6 ,%%ymm6 	\n\t	vsubpd		%%ymm14,%%ymm7 ,%%ymm7 	\n\t"\
	"vsubpd		%%ymm8 ,%%ymm0 ,%%ymm0 	\n\t	vsubpd		%%ymm9 ,%%ymm1 ,%%ymm1 	\n\t"\
	"vsubpd		%%ymm11,%%ymm2 ,%%ymm2 	\n\t	vsubpd		%%ymm10,%%ymm3 ,%%ymm3 	\n\t"\
	/* We hope the microcode execution engine inlines the MULs with the above SUBs: */\
	"vmovaps	%%ymm10,(%%rdx) 	\n\t"/* spill ymm14 to make room for 2.0 */"	vmovaps	(%%rsi),%%ymm10 	\n\t"/* two */\
	"vmulpd		%%ymm10,%%ymm12,%%ymm12	\n\t	vmulpd		%%ymm10,%%ymm13,%%ymm13	\n\t"\
	"vmulpd		%%ymm10,%%ymm15,%%ymm15	\n\t	vmulpd		%%ymm10,%%ymm14,%%ymm14	\n\t"\
	"vmulpd		%%ymm10,%%ymm8 ,%%ymm8 	\n\t	vmulpd		%%ymm10,%%ymm9 ,%%ymm9 	\n\t"\
	"vmulpd		%%ymm10,%%ymm11,%%ymm11	\n\t	vmulpd		(%%rdx),%%ymm10,%%ymm10	\n\t"\
	"vaddpd		%%ymm4 ,%%ymm12,%%ymm12	\n\t	vaddpd		%%ymm5 ,%%ymm13,%%ymm13	\n\t"\
	"vaddpd		%%ymm6 ,%%ymm15,%%ymm15	\n\t	vaddpd		%%ymm7 ,%%ymm14,%%ymm14	\n\t"\
	"vaddpd		%%ymm0 ,%%ymm8 ,%%ymm8 	\n\t	vaddpd		%%ymm1 ,%%ymm9 ,%%ymm9 	\n\t"\
	"vaddpd		%%ymm2 ,%%ymm11,%%ymm11	\n\t	vaddpd		%%ymm3 ,%%ymm10,%%ymm10	\n\t"\
	/* In terms of our original scalar-code prototyping macro, the data are: __tr0 = _r[c,f,4,6,8,b,0,2], __ti0 = _r[d,7,5,e,9,3,1,a]; */\
	/* Now combine the two half-transforms: */\
	/* Need r2/3+- a/b combos for the *ISRT2 preceding the output 4-7 radix-2 butterflies, so start them first: */\
	"vsubpd		%%ymm3 ,%%ymm11,%%ymm11	\n\t	vsubpd		%%ymm10,%%ymm2 ,%%ymm2 	\n\t"\
	"vsubpd		%%ymm8 ,%%ymm12,%%ymm12	\n\t	vsubpd		%%ymm9 ,%%ymm13,%%ymm13	\n\t"\
	"vsubpd		%%ymm1 ,%%ymm4 ,%%ymm4 	\n\t	vsubpd		%%ymm0 ,%%ymm5 ,%%ymm5 	\n\t"\
	"vmovaps	%%ymm0 ,(%%rdx) 	\n\t"/* spill ymm14 to make room for 2.0 */"	vmovaps	(%%rsi),%%ymm0  	\n\t"/* two */\
	"vmulpd		%%ymm0 ,%%ymm3 ,%%ymm3 	\n\t	vmulpd		%%ymm0 ,%%ymm10,%%ymm10	\n\t"\
	"vmulpd		%%ymm0 ,%%ymm8 ,%%ymm8 	\n\t	vmulpd		%%ymm0 ,%%ymm9 ,%%ymm9 	\n\t"\
	"vmulpd		%%ymm0 ,%%ymm1 ,%%ymm1 	\n\t	vmulpd		(%%rdx),%%ymm0 ,%%ymm0 	\n\t"\
	"vaddpd		%%ymm11,%%ymm3 ,%%ymm3 	\n\t	vaddpd		%%ymm2 ,%%ymm10,%%ymm10	\n\t"\
	"vaddpd		%%ymm12,%%ymm8 ,%%ymm8 	\n\t	vaddpd		%%ymm13,%%ymm9 ,%%ymm9 	\n\t"\
	"vaddpd		%%ymm4 ,%%ymm1 ,%%ymm1 	\n\t	vaddpd		%%ymm5 ,%%ymm0 ,%%ymm0 	\n\t"\
	/*movq		%[o0],%%rax		[o0] already in rax */	\
	"leaq	(%%rax,%%r9 ),%%rcx		\n\t"/* out0 + off2, compute first to allow time for LEA to finish before += off4 to get out0 + off6 */\
	"leaq	(%%rax,%%r10),%%rbx		\n\t"/* out0 + off4 */\
	"leaq	(%%rcx,%%r10),%%rdx		\n\t"/* out0 + off6 */\
	"vmovaps	%%ymm12,    (%%rbx)		\n\t	vmovaps		%%ymm13,0x20(%%rbx)		\n\t"/* __Br1 = _rc;	__Bi1 = _rd; */\
	/* Use that _rc,d free to stick 2.0 into _rc and that [c4] in rdi to load ISRT2 from c4-1 into _rd: */\
	"vmovaps		(%%rsi),%%ymm12		\n\t	vmovaps		-0x20(%%rdi),%%ymm13	\n\t"/* _rc = 2.0;		_rd = ISRT2; */\
	"vmovaps	%%ymm4 ,    (%%rdx)		\n\t	vmovaps		%%ymm0 ,0x20(%%rdx)		\n\t"/* __Br3 = _r4;	__Bi3 = _r0; */\
	"vmovaps	%%ymm8 ,    (%%rax)		\n\t	vmovaps		%%ymm9 ,0x20(%%rax)		\n\t"/* __Br0 = _r8;	__Bi0 = _r9; */\
	"vmovaps	%%ymm1 ,    (%%rcx)		\n\t	vmovaps		%%ymm5 ,0x20(%%rcx)		\n\t"/* __Br2 = _r1;	__Bi2 = _r5; */\
	"vmulpd		%%ymm13,%%ymm3 ,%%ymm3 	\n\t	vmulpd		%%ymm13,%%ymm11,%%ymm11	\n\t"\
	"vmulpd		%%ymm13,%%ymm2 ,%%ymm2 	\n\t	vmulpd		%%ymm13,%%ymm10,%%ymm10	\n\t"\
	"vsubpd		%%ymm3 ,%%ymm15,%%ymm15	\n\t	vsubpd		%%ymm11,%%ymm7 ,%%ymm7 	\n\t"\
	"vsubpd		%%ymm2 ,%%ymm6 ,%%ymm6 	\n\t	vsubpd		%%ymm10,%%ymm14,%%ymm14	\n\t"\
	"vmulpd		%%ymm12,%%ymm3 ,%%ymm3 	\n\t	vmulpd		%%ymm12,%%ymm11,%%ymm11	\n\t"\
	"vmulpd		%%ymm12,%%ymm2 ,%%ymm2 	\n\t	vmulpd		%%ymm12,%%ymm10,%%ymm10	\n\t"\
	"vaddpd		%%ymm15,%%ymm3 ,%%ymm3 	\n\t	vaddpd		%%ymm7 ,%%ymm11,%%ymm11	\n\t"\
	"vaddpd		%%ymm6 ,%%ymm2 ,%%ymm2 	\n\t	vaddpd		%%ymm14,%%ymm10,%%ymm10	\n\t"\
	"addq		%%r8 ,%%rax			\n\t"/* out0 + off[1,5,3,7] */\
	"addq		%%r8 ,%%rbx			\n\t"\
	"addq		%%r8 ,%%rcx			\n\t"\
	"addq		%%r8 ,%%rdx			\n\t"\
	"vmovaps	%%ymm3 ,    (%%rax)		\n\t	vmovaps		%%ymm7 ,0x20(%%rax)		\n\t"/* __Br4 = _r3;	__Bi4 = _r7; */\
	"vmovaps	%%ymm15,    (%%rbx)		\n\t	vmovaps		%%ymm11,0x20(%%rbx)		\n\t"/* __Br5 = _rf;	__Bi5 = _rb; */\
	"vmovaps	%%ymm6 ,    (%%rcx)		\n\t	vmovaps		%%ymm14,0x20(%%rcx)		\n\t"/* __Br6 = _r6;	__Bi6 = _re; */\
	"vmovaps	%%ymm2 ,    (%%rdx)		\n\t	vmovaps		%%ymm10,0x20(%%rdx)		\n\t"/* __Br7 = _r2;	__Bi7 = _ra; */\
		:					/* outputs: none */\
		: [in0] "m" (Xin0)	/* All 'm'-inputs from memory addresses here... */\
		 ,[i1] "e" (Xi1)	/* ...except for 'e'-inputs which are literal byte offsets */\
		 ,[out0] "m" (Xout0)\
		 ,[o_off] "m" (Xo_off)/* O-address pointer-stride */\
		 ,[twid_ptrs] "m" (Xtwid_ptrs)\
		 ,[two] "m" (Xtwo)/* Only used in FMA implementations of this macro */\
		: "cc","memory","rax","rbx","rcx","rdx","rsi","rdi","r8","r9","r10","r11","r12","r13","r14","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15"	/* Clobbered registers */\
	);\
	}

/*** Prefetch odd-index I-addresses in DIF below, even-index O-addresses in SSE2_RADIX16_DIF_TWIDDLE_OOP ***/

	// Based on the SSE2_RADIX16_DIF_NOTWIDDLE macro in radix16_ditN_cy_dif1_gcc64.h, but with completely
	// specifiable 16-output addressing required for usage as the power-of-2 component of a twiddleless
	// radix = [odd*2^n] DFT routine.
	/* Dec 2020: Needed to cut #args for Apple M1/Clang builds on Arm64 - do similar on x86 to avoid
	multiple versions of the macro having different arglists. Replace 16 O-addresses with O-base-address
	out0 and pointer to array of 16 int offset-indices: */
	#define SSE2_RADIX16_DIF_0TWIDDLE(Xin0,Xi1,Xi2,Xi3,Xi4, Xisrt2,Xtwo, Xout0,Xoff)\
	{\
	__asm__ volatile (\
	/* SSE2_RADIX4_DIF_IN_PLACE(r1 , r17, r9 , r25): */\
		"movq	%[__in0],%%rax		\n\t"\
		"leaq	%c[__i4](%%rax),%%rcx	\n\t"/* __in0 +   [4*istride]; note BR of [a,b,c,d]-ptrs, i.e. b/c swap */\
		"leaq	%c[__i4](%%rcx),%%rbx	\n\t"/* __in0 + 2*[4*istride] */\
		"leaq	%c[__i4](%%rbx),%%rdx	\n\t"/* __in0 + 3*[4*istride] */\
		"vmovaps	    (%%rax),%%ymm0		\n\t"\
		"vmovaps	0x20(%%rax),%%ymm1		\n\t"\
		"vmovaps	    (%%rax),%%ymm2	\n\t"\
		"vmovaps	0x20(%%rax),%%ymm3	\n\t"\
		"vaddpd	    (%%rbx),%%ymm0,%%ymm0	\n\t"\
		"vaddpd	0x20(%%rbx),%%ymm1,%%ymm1	\n\t"\
		"vsubpd	    (%%rbx),%%ymm2,%%ymm2	\n\t"\
		"vsubpd	0x20(%%rbx),%%ymm3,%%ymm3	\n\t"\
		"vmovaps	    (%%rcx),%%ymm4	\n\t"\
		"vmovaps	0x20(%%rcx),%%ymm5	\n\t"\
		"vmovaps	    (%%rcx),%%ymm6		\n\t"\
		"vmovaps	0x20(%%rcx),%%ymm7		\n\t"\
		"vaddpd	    (%%rdx),%%ymm4,%%ymm4	\n\t"\
		"vaddpd	0x20(%%rdx),%%ymm5,%%ymm5	\n\t"\
		"vsubpd	    (%%rdx),%%ymm6,%%ymm6	\n\t"\
		"vsubpd	0x20(%%rdx),%%ymm7,%%ymm7	\n\t"\
		"vsubpd	%%ymm4,%%ymm0,%%ymm0		\n\t"\
		"vsubpd	%%ymm5,%%ymm1,%%ymm1		\n\t"\
		"vmovaps	%%ymm0,    (%%rbx)	\n\t"\
		"vmovaps	%%ymm1,0x20(%%rbx)	\n\t"\
		"vaddpd	%%ymm4,%%ymm4,%%ymm4		\n\t"\
		"vaddpd	%%ymm5,%%ymm5,%%ymm5		\n\t"\
		"vaddpd	%%ymm0,%%ymm4,%%ymm4		\n\t"\
		"vaddpd	%%ymm1,%%ymm5,%%ymm5		\n\t"\
		"vmovaps	%%ymm4,    (%%rax)	\n\t"\
		"vmovaps	%%ymm5,0x20(%%rax)	\n\t"\
		"vsubpd	%%ymm7,%%ymm2,%%ymm2		\n\t"\
		"vsubpd	%%ymm6,%%ymm3,%%ymm3		\n\t"\
		"vmovaps	%%ymm2,    (%%rcx)	\n\t"\
		"vmovaps	%%ymm3,0x20(%%rdx)	\n\t"\
		"vaddpd	%%ymm7,%%ymm7,%%ymm7		\n\t"\
		"vaddpd	%%ymm6,%%ymm6,%%ymm6		\n\t"\
		"vaddpd	%%ymm2,%%ymm7,%%ymm7		\n\t"\
		"vaddpd	%%ymm3,%%ymm6,%%ymm6		\n\t"\
		"vmovaps	%%ymm7,    (%%rdx)	\n\t"\
		"vmovaps	%%ymm6,0x20(%%rcx)	\n\t"\
	/* SSE2_RADIX4_DIF_IN_PLACE(r5 , r21, r13, r29): */\
		"addq	$%c[__i2],%%rax	\n\t"/* All addresses += 2*ostride */\
		"addq	$%c[__i2],%%rbx	\n\t"\
		"addq	$%c[__i2],%%rcx	\n\t"\
		"addq	$%c[__i2],%%rdx	\n\t"\
		"vmovaps	    (%%rax),%%ymm0		\n\t"\
		"vmovaps	0x20(%%rax),%%ymm1		\n\t"\
		"vmovaps	    (%%rax),%%ymm2	\n\t"\
		"vmovaps	0x20(%%rax),%%ymm3	\n\t"\
		"vaddpd	    (%%rbx),%%ymm0,%%ymm0	\n\t"\
		"vaddpd	0x20(%%rbx),%%ymm1,%%ymm1	\n\t"\
		"vsubpd	    (%%rbx),%%ymm2,%%ymm2	\n\t"\
		"vsubpd	0x20(%%rbx),%%ymm3,%%ymm3	\n\t"\
		"vmovaps	    (%%rcx),%%ymm4	\n\t"\
		"vmovaps	0x20(%%rcx),%%ymm5	\n\t"\
		"vmovaps	    (%%rcx),%%ymm6	\n\t"\
		"vmovaps	0x20(%%rcx),%%ymm7	\n\t"\
		"vaddpd	    (%%rdx),%%ymm4,%%ymm4	\n\t"\
		"vaddpd	0x20(%%rdx),%%ymm5,%%ymm5	\n\t"\
		"vsubpd	    (%%rdx),%%ymm6,%%ymm6	\n\t"\
		"vsubpd	0x20(%%rdx),%%ymm7,%%ymm7	\n\t"\
		"vsubpd	%%ymm4,%%ymm0,%%ymm0		\n\t"\
		"vsubpd	%%ymm5,%%ymm1,%%ymm1		\n\t"\
		"vmovaps	%%ymm0,    (%%rbx)		\n\t"\
		"vmovaps	%%ymm1,0x20(%%rbx)		\n\t"\
		"vaddpd	%%ymm4,%%ymm4,%%ymm4		\n\t"\
		"vaddpd	%%ymm5,%%ymm5,%%ymm5		\n\t"\
		"vaddpd	%%ymm0,%%ymm4,%%ymm4		\n\t"\
		"vaddpd	%%ymm1,%%ymm5,%%ymm5		\n\t"\
		"vmovaps	%%ymm4,    (%%rax)	\n\t"\
		"vmovaps	%%ymm5,0x20(%%rax)	\n\t"\
		"vsubpd	%%ymm7,%%ymm2,%%ymm2		\n\t"\
		"vsubpd	%%ymm6,%%ymm3,%%ymm3		\n\t"\
		"vmovaps	%%ymm2,    (%%rcx)	\n\t"\
		"vmovaps	%%ymm3,0x20(%%rdx)	\n\t"\
		"vaddpd	%%ymm7,%%ymm7,%%ymm7		\n\t"\
		"vaddpd	%%ymm6,%%ymm6,%%ymm6		\n\t"\
		"vaddpd	%%ymm2,%%ymm7,%%ymm7		\n\t"\
		"vaddpd	%%ymm3,%%ymm6,%%ymm6		\n\t"\
		"vmovaps	%%ymm7,    (%%rdx)	\n\t"\
		"vmovaps	%%ymm6,0x20(%%rcx)	\n\t"\
	/* SSE2_RADIX4_DIF_IN_PLACE(r3 , r19, r11, r27): */\
		"subq	$%c[__i1],%%rax	\n\t"/* All addresses -= 1*ostride */\
		"subq	$%c[__i1],%%rbx	\n\t"\
		"subq	$%c[__i1],%%rcx	\n\t"\
		"subq	$%c[__i1],%%rdx	\n\t"\
		"vmovaps	    (%%rax),%%ymm0	\n\t"\
		"vmovaps	0x20(%%rax),%%ymm1	\n\t"\
		"vmovaps	    (%%rax),%%ymm2	\n\t"\
		"vmovaps	0x20(%%rax),%%ymm3	\n\t"\
		"vaddpd	    (%%rbx),%%ymm0,%%ymm0	\n\t"\
		"vaddpd	0x20(%%rbx),%%ymm1,%%ymm1	\n\t"\
		"vsubpd	    (%%rbx),%%ymm2,%%ymm2	\n\t"\
		"vsubpd	0x20(%%rbx),%%ymm3,%%ymm3	\n\t"\
		"vmovaps	    (%%rcx),%%ymm4	\n\t"\
		"vmovaps	0x20(%%rcx),%%ymm5	\n\t"\
		"vmovaps	    (%%rcx),%%ymm6	\n\t"\
		"vmovaps	0x20(%%rcx),%%ymm7	\n\t"\
		"vaddpd	    (%%rdx),%%ymm4,%%ymm4	\n\t"\
		"vaddpd	0x20(%%rdx),%%ymm5,%%ymm5	\n\t"\
		"vsubpd	    (%%rdx),%%ymm6,%%ymm6	\n\t"\
		"vsubpd	0x20(%%rdx),%%ymm7,%%ymm7	\n\t"\
		"vsubpd	%%ymm4,%%ymm0,%%ymm0			\n\t"\
		"vsubpd	%%ymm5,%%ymm1,%%ymm1			\n\t"\
		"vmovaps	%%ymm0,    (%%rbx)	\n\t"\
		"vmovaps	%%ymm1,0x20(%%rbx)	\n\t"\
		"vaddpd	%%ymm4,%%ymm4,%%ymm4		\n\t"\
		"vaddpd	%%ymm5,%%ymm5,%%ymm5			\n\t"\
		"vaddpd	%%ymm0,%%ymm4,%%ymm4			\n\t"\
		"vaddpd	%%ymm1,%%ymm5,%%ymm5			\n\t"\
		"vmovaps	%%ymm4,    (%%rax)	\n\t"\
		"vmovaps	%%ymm5,0x20(%%rax)	\n\t"\
		"vsubpd	%%ymm7,%%ymm2,%%ymm2		\n\t"\
		"vsubpd	%%ymm6,%%ymm3,%%ymm3		\n\t"\
		"vmovaps	%%ymm2,    (%%rcx)	\n\t"\
		"vmovaps	%%ymm3,0x20(%%rdx)	\n\t"\
		"vaddpd	%%ymm7,%%ymm7,%%ymm7		\n\t"\
		"vaddpd	%%ymm6,%%ymm6,%%ymm6		\n\t"\
		"vaddpd	%%ymm2,%%ymm7,%%ymm7		\n\t"\
		"vaddpd	%%ymm3,%%ymm6,%%ymm6		\n\t"\
		"vmovaps	%%ymm7,    (%%rdx)	\n\t"\
		"vmovaps	%%ymm6,0x20(%%rcx)	\n\t"\
	/* SSE2_RADIX4_DIF_IN_PLACE(r7 , r23, r15, r31): */\
		"addq	$%c[__i2],%%rax	\n\t"/* All addresses += 2*ostride */\
		"addq	$%c[__i2],%%rbx	\n\t"\
		"addq	$%c[__i2],%%rcx	\n\t"\
		"addq	$%c[__i2],%%rdx	\n\t"\
		"vmovaps	    (%%rax),%%ymm0		\n\t"\
		"vmovaps	0x20(%%rax),%%ymm1		\n\t"\
		"vmovaps	    (%%rax),%%ymm2	\n\t"\
		"vmovaps	0x20(%%rax),%%ymm3	\n\t"\
		"vaddpd	    (%%rbx),%%ymm0,%%ymm0	\n\t"\
		"vaddpd	0x20(%%rbx),%%ymm1,%%ymm1	\n\t"\
		"vsubpd	    (%%rbx),%%ymm2,%%ymm2	\n\t"\
		"vsubpd	0x20(%%rbx),%%ymm3,%%ymm3	\n\t"\
		"vmovaps	    (%%rcx),%%ymm4	\n\t"\
		"vmovaps	0x20(%%rcx),%%ymm5	\n\t"\
		"vmovaps	    (%%rcx),%%ymm6	\n\t"\
		"vmovaps	0x20(%%rcx),%%ymm7	\n\t"\
		"vaddpd	    (%%rdx),%%ymm4,%%ymm4	\n\t"\
		"vaddpd	0x20(%%rdx),%%ymm5,%%ymm5	\n\t"\
		"vsubpd	    (%%rdx),%%ymm6,%%ymm6	\n\t"\
		"vsubpd	0x20(%%rdx),%%ymm7,%%ymm7	\n\t"\
		"vsubpd	%%ymm4,%%ymm0,%%ymm0		\n\t"\
		"vsubpd	%%ymm5,%%ymm1,%%ymm1		\n\t"\
		"vmovaps	%%ymm0,    (%%rbx)	\n\t"\
		"vmovaps	%%ymm1,0x20(%%rbx)	\n\t"\
		"vaddpd	%%ymm4,%%ymm4,%%ymm4		\n\t"\
		"vaddpd	%%ymm5,%%ymm5,%%ymm5		\n\t"\
		"vaddpd	%%ymm0,%%ymm4,%%ymm4		\n\t"\
		"vaddpd	%%ymm1,%%ymm5,%%ymm5		\n\t"\
		"vmovaps	%%ymm4,    (%%rax)		\n\t"\
		"vmovaps	%%ymm5,0x20(%%rax)		\n\t"\
		"vsubpd	%%ymm7,%%ymm2,%%ymm2		\n\t"\
		"vsubpd	%%ymm6,%%ymm3,%%ymm3		\n\t"\
		"vmovaps	%%ymm2,    (%%rcx)	\n\t"\
		"vmovaps	%%ymm3,0x20(%%rdx)	\n\t"\
		"vaddpd	%%ymm7,%%ymm7,%%ymm7		\n\t"\
		"vaddpd	%%ymm6,%%ymm6,%%ymm6		\n\t"\
		"vaddpd	%%ymm2,%%ymm7,%%ymm7		\n\t"\
		"vaddpd	%%ymm3,%%ymm6,%%ymm6		\n\t"\
		"vmovaps	%%ymm7,    (%%rdx)	\n\t"\
		"vmovaps	%%ymm6,0x20(%%rcx)	\n\t"\
	/*****************************************************************************************
	**** Now do 4 DFTs with internal twiddles on the 1*stride - separated data. Do blocks ****
	**** in order 0,2,1,3 to allow increment-only of rsi-datum from 1 block to the next:  ****
	*****************************************************************************************/\
	"movq	%[__in0],%%rsi	\n\t"\
	"movq	%[out0],%%r8	\n\t	movq	%[off],%%r9	\n\t"/* Load output base-address into r8 and offset-array pointer into r9 */\
	/* Block 0: r0-3 */\
		"movslq		    (%%r9),%%rax	\n\t"/*        off0 */\
		"movslq		0x04(%%r9),%%rbx	\n\t"/*        off1 */\
		"movslq		0x08(%%r9),%%rcx	\n\t"/*        off2 */\
		"movslq		0x0c(%%r9),%%rdx	\n\t"/*        off3 */\
		"leaq	(%%r8,%%rax,8),%%rax	\n\t"/* out0 + off0 */\
		"leaq	(%%r8,%%rbx,8),%%rbx	\n\t"/* out0 + off1 */\
		"leaq	(%%r8,%%rcx,8),%%rcx	\n\t"/* out0 + off2 */\
		"leaq	(%%r8,%%rdx,8),%%rdx	\n\t"/* out0 + off3 */\
		"leaq	0x20(%%rsi),%%rdi	\n\t"/* Need separate address Im parts of outputs due to literal-offsets below */\
		"vmovaps	        (%%rsi),%%ymm0	\n\t"\
		"vmovaps	        (%%rdi),%%ymm1	\n\t"\
		"vmovaps	%c[__i2](%%rsi),%%ymm2	\n\t"\
		"vmovaps	%c[__i2](%%rdi),%%ymm3	\n\t"\
		"vsubpd	%c[__i2](%%rsi),%%ymm0,%%ymm0	\n\t"\
		"vsubpd	%c[__i2](%%rdi),%%ymm1,%%ymm1	\n\t"\
		"vaddpd	        (%%rsi),%%ymm2,%%ymm2	\n\t"\
		"vaddpd	        (%%rdi),%%ymm3,%%ymm3	\n\t"\
		"vmovaps	%c[__i1](%%rsi),%%ymm4	\n\t"\
		"vmovaps	%c[__i1](%%rdi),%%ymm5	\n\t"\
		"vmovaps	%c[__i3](%%rsi),%%ymm6	\n\t"\
		"vmovaps	%c[__i3](%%rdi),%%ymm7	\n\t"\
		"vsubpd	%c[__i3](%%rsi),%%ymm4,%%ymm4	\n\t"\
		"vsubpd	%c[__i3](%%rdi),%%ymm5,%%ymm5	\n\t"\
		"vaddpd	%c[__i1](%%rsi),%%ymm6,%%ymm6	\n\t"\
		"vaddpd	%c[__i1](%%rdi),%%ymm7,%%ymm7	\n\t"\
		"vsubpd	%%ymm6,%%ymm2,%%ymm2		\n\t"\
		"vsubpd	%%ymm7,%%ymm3,%%ymm3		\n\t"\
		"vaddpd	%%ymm6,%%ymm6,%%ymm6		\n\t"\
		"vaddpd	%%ymm7,%%ymm7,%%ymm7		\n\t"\
		"vmovaps	%%ymm2,    (%%rbx)	\n\t"\
		"vmovaps	%%ymm3,0x20(%%rbx)	\n\t"\
		"vaddpd	%%ymm2,%%ymm6,%%ymm6	\n\t"\
		"vaddpd	%%ymm3,%%ymm7,%%ymm7	\n\t"\
		"vmovaps	%%ymm6,    (%%rax)	\n\t"\
		"vmovaps	%%ymm7,0x20(%%rax)	\n\t"\
		"vsubpd	%%ymm5,%%ymm0,%%ymm0	\n\t"\
		"vsubpd	%%ymm4,%%ymm1,%%ymm1	\n\t"\
		"vaddpd	%%ymm5,%%ymm5,%%ymm5	\n\t"\
		"vaddpd	%%ymm4,%%ymm4,%%ymm4	\n\t"\
		"vmovaps	%%ymm0,    (%%rcx)	\n\t"\
		"vmovaps	%%ymm1,0x20(%%rdx)	\n\t"\
		"vaddpd	%%ymm0,%%ymm5,%%ymm5	\n\t"\
		"vaddpd	%%ymm1,%%ymm4,%%ymm4	\n\t"\
		"vmovaps	%%ymm5,    (%%rdx)	\n\t"\
		"vmovaps	%%ymm4,0x20(%%rcx)	\n\t"\
	/* Block 2: */\
		"movslq		0x20(%%r9),%%rax	\n\t"/* off8-b */\
		"movslq		0x24(%%r9),%%rbx	\n\t"\
		"movslq		0x28(%%r9),%%rcx	\n\t"\
		"movslq		0x2c(%%r9),%%rdx	\n\t"\
		"leaq	(%%r8,%%rax,8),%%rax	\n\t"\
		"leaq	(%%r8,%%rbx,8),%%rbx	\n\t"\
		"leaq	(%%r8,%%rcx,8),%%rcx	\n\t"\
		"leaq	(%%r8,%%rdx,8),%%rdx	\n\t"\
		"addq	$%c[__i4],%%rsi	\n\t"/* __in0 + 4*ostride */\
		"leaq	0x20(%%rsi),%%rdi	\n\t"\
		"vmovaps	%c[__i1](%%rsi),%%ymm4	\n\t"\
		"vmovaps	%c[__i3](%%rsi),%%ymm6	\n\t"\
		"vmovaps	%c[__i1](%%rdi),%%ymm5	\n\t"\
		"vmovaps	%c[__i3](%%rdi),%%ymm7	\n\t"\
		"vmovaps	%%ymm4,%%ymm0		\n\t"\
		"vmovaps	%%ymm6,%%ymm2		\n\t"\
		"vmovaps	%%ymm5,%%ymm1		\n\t"\
		"vmovaps	%%ymm7,%%ymm3		\n\t"\
		"movq	%[__isrt2],%%rdi	\n\t"\
		"addq	$0x20,%%rdi	\n\t"/* cc0 */\
		"vmulpd	    (%%rdi),%%ymm4,%%ymm4	\n\t"\
		"vmulpd	0x20(%%rdi),%%ymm6,%%ymm6	\n\t"\
		"vmulpd	0x20(%%rdi),%%ymm1,%%ymm1	\n\t"\
		"vmulpd	    (%%rdi),%%ymm3,%%ymm3	\n\t"\
		"vmulpd	    (%%rdi),%%ymm5,%%ymm5	\n\t"\
		"vmulpd	0x20(%%rdi),%%ymm7,%%ymm7	\n\t"\
		"vmulpd	0x20(%%rdi),%%ymm0,%%ymm0	\n\t"\
		"vmulpd	    (%%rdi),%%ymm2,%%ymm2	\n\t"\
		"vsubpd	%%ymm1,%%ymm4,%%ymm4		\n\t"\
		"vsubpd	%%ymm3,%%ymm6,%%ymm6		\n\t"\
		"vaddpd	%%ymm0,%%ymm5,%%ymm5		\n\t"\
		"vaddpd	%%ymm2,%%ymm7,%%ymm7		\n\t"\
		"vsubpd	%%ymm6,%%ymm4,%%ymm4		\n\t"\
		"vsubpd	%%ymm7,%%ymm5,%%ymm5		\n\t"\
		"vaddpd	%%ymm6,%%ymm6,%%ymm6		\n\t"\
		"vaddpd	%%ymm7,%%ymm7,%%ymm7		\n\t"\
		"vaddpd	%%ymm4,%%ymm6,%%ymm6		\n\t"\
		"vaddpd	%%ymm5,%%ymm7,%%ymm7		\n\t"\
		"leaq	0x20(%%rsi),%%rdi	\n\t"\
		"vmovaps	%c[__i2](%%rsi),%%ymm2	\n\t"\
		"vmovaps	%c[__i2](%%rdi),%%ymm3	\n\t"\
		"vsubpd	%c[__i2](%%rdi),%%ymm2,%%ymm2	\n\t"\
		"vaddpd	%c[__i2](%%rsi),%%ymm3,%%ymm3	\n\t"\
		"movq	%[__isrt2],%%rdi	\n\t"\
		"vmulpd	(%%rdi),%%ymm2,%%ymm2	\n\t"/* mul by isrt2 */\
		"vmulpd	(%%rdi),%%ymm3,%%ymm3	\n\t"\
		"leaq	0x20(%%rsi),%%rdi	\n\t"\
		"vmovaps	        (%%rsi),%%ymm0	\n\t"\
		"vmovaps	        (%%rdi),%%ymm1	\n\t"\
		"vsubpd	%%ymm2,%%ymm0,%%ymm0		\n\t"\
		"vsubpd	%%ymm3,%%ymm1,%%ymm1		\n\t"\
		"vaddpd	        (%%rsi),%%ymm2,%%ymm2	\n\t"\
		"vaddpd	        (%%rdi),%%ymm3,%%ymm3	\n\t"\
		"vsubpd	%%ymm6,%%ymm2,%%ymm2		\n\t"\
		"vsubpd	%%ymm7,%%ymm3,%%ymm3		\n\t"\
		"vaddpd	%%ymm6,%%ymm6,%%ymm6		\n\t"\
		"vaddpd	%%ymm7,%%ymm7,%%ymm7		\n\t"\
		"vmovaps	%%ymm2,    (%%rbx)	\n\t"\
		"vmovaps	%%ymm3,0x20(%%rbx)	\n\t"\
		"vaddpd	%%ymm2,%%ymm6,%%ymm6	\n\t"\
		"vaddpd	%%ymm3,%%ymm7,%%ymm7	\n\t"\
		"vmovaps	%%ymm6,    (%%rax)	\n\t"\
		"vmovaps	%%ymm7,0x20(%%rax)	\n\t"\
		"vsubpd	%%ymm5,%%ymm0,%%ymm0	\n\t"\
		"vsubpd	%%ymm4,%%ymm1,%%ymm1	\n\t"\
		"vaddpd	%%ymm5,%%ymm5,%%ymm5	\n\t"\
		"vaddpd	%%ymm4,%%ymm4,%%ymm4	\n\t"\
		"vmovaps	%%ymm0,    (%%rcx)	\n\t"\
		"vmovaps	%%ymm1,0x20(%%rdx)	\n\t"\
		"vaddpd	%%ymm0,%%ymm5,%%ymm5	\n\t"\
		"vaddpd	%%ymm1,%%ymm4,%%ymm4	\n\t"\
		"vmovaps	%%ymm5,    (%%rdx)	\n\t"\
		"vmovaps	%%ymm4,0x20(%%rcx)	\n\t"\
	/* Block 1: r8-b */\
		"movslq		0x10(%%r9),%%rax	\n\t"/* off4-7 */\
		"movslq		0x14(%%r9),%%rbx	\n\t"\
		"movslq		0x18(%%r9),%%rcx	\n\t"\
		"movslq		0x1c(%%r9),%%rdx	\n\t"\
		"leaq	(%%r8,%%rax,8),%%rax	\n\t"\
		"leaq	(%%r8,%%rbx,8),%%rbx	\n\t"\
		"leaq	(%%r8,%%rcx,8),%%rcx	\n\t"\
		"leaq	(%%r8,%%rdx,8),%%rdx	\n\t"\
		"addq	$%c[__i4],%%rsi	\n\t"/* __in0 + 8*ostride */\
		"leaq	0x20(%%rsi),%%rdi	\n\t"\
		"vmovaps	        (%%rsi),%%ymm0	\n\t"\
		"vmovaps	        (%%rdi),%%ymm1	\n\t"\
		"vmovaps	%c[__i2](%%rsi),%%ymm2	\n\t"\
		"vmovaps	%c[__i2](%%rdi),%%ymm3	\n\t"\
		"vsubpd	%c[__i2](%%rdi),%%ymm0,%%ymm0	\n\t"\
		"vsubpd	%c[__i2](%%rsi),%%ymm1,%%ymm1	\n\t"\
		"vaddpd	        (%%rdi),%%ymm2,%%ymm2	\n\t"\
		"vaddpd	        (%%rsi),%%ymm3,%%ymm3	\n\t"\
		"vmovaps	%c[__i1](%%rsi),%%ymm4	\n\t"\
		"vmovaps	%c[__i1](%%rdi),%%ymm5	\n\t"\
		"vmovaps	%c[__i3](%%rsi),%%ymm6	\n\t"\
		"vmovaps	%c[__i3](%%rdi),%%ymm7	\n\t"\
		"vsubpd	%c[__i1](%%rdi),%%ymm4,%%ymm4	\n\t"\
		"vaddpd	%c[__i1](%%rsi),%%ymm5,%%ymm5	\n\t"\
		"vaddpd	%c[__i3](%%rdi),%%ymm6,%%ymm6	\n\t"\
		"vsubpd	%c[__i3](%%rsi),%%ymm7,%%ymm7	\n\t"\
		"movq	%[__isrt2],%%rdi	\n\t"\
		"vmulpd	(%%rdi),%%ymm4,%%ymm4		\n\t"\
		"vmulpd	(%%rdi),%%ymm5,%%ymm5		\n\t"\
		"vmulpd	(%%rdi),%%ymm6,%%ymm6		\n\t"\
		"vmulpd	(%%rdi),%%ymm7,%%ymm7		\n\t"\
		"vsubpd	%%ymm6,%%ymm4,%%ymm4		\n\t"\
		"vsubpd	%%ymm7,%%ymm5,%%ymm5		\n\t"\
		"vaddpd	%%ymm6,%%ymm6,%%ymm6		\n\t"\
		"vaddpd	%%ymm7,%%ymm7,%%ymm7		\n\t"\
		"vaddpd	%%ymm4,%%ymm6,%%ymm6		\n\t"\
		"vaddpd	%%ymm5,%%ymm7,%%ymm7		\n\t"\
		"vsubpd	%%ymm4,%%ymm0,%%ymm0		\n\t"\
		"vsubpd	%%ymm5,%%ymm2,%%ymm2		\n\t"\
		"vaddpd	%%ymm4,%%ymm4,%%ymm4		\n\t"\
		"vaddpd	%%ymm5,%%ymm5,%%ymm5		\n\t"\
		"vmovaps	%%ymm0,    (%%rbx)	\n\t"\
		"vmovaps	%%ymm2,0x20(%%rbx)	\n\t"\
		"vaddpd	%%ymm0,%%ymm4,%%ymm4	\n\t"\
		"vaddpd	%%ymm2,%%ymm5,%%ymm5	\n\t"\
		"vmovaps	%%ymm4,    (%%rax)	\n\t"\
		"vmovaps	%%ymm5,0x20(%%rax)	\n\t"\
		"vsubpd	%%ymm7,%%ymm3,%%ymm3	\n\t"\
		"vsubpd	%%ymm6,%%ymm1,%%ymm1	\n\t"\
		"vaddpd	%%ymm7,%%ymm7,%%ymm7	\n\t"\
		"vaddpd	%%ymm6,%%ymm6,%%ymm6	\n\t"\
		"vmovaps	%%ymm3,    (%%rcx)	\n\t"\
		"vmovaps	%%ymm1,0x20(%%rdx)	\n\t"\
		"vaddpd	%%ymm3,%%ymm7,%%ymm7	\n\t"\
		"vaddpd	%%ymm1,%%ymm6,%%ymm6	\n\t"\
		"vmovaps	%%ymm7,    (%%rdx)	\n\t"\
		"vmovaps	%%ymm6,0x20(%%rcx)	\n\t"\
	/* Block 3: */\
		"movslq		0x30(%%r9),%%rax	\n\t"/* offc-f */\
		"movslq		0x34(%%r9),%%rbx	\n\t"\
		"movslq		0x38(%%r9),%%rcx	\n\t"\
		"movslq		0x3c(%%r9),%%rdx	\n\t"\
		"leaq	(%%r8,%%rax,8),%%rax	\n\t"\
		"leaq	(%%r8,%%rbx,8),%%rbx	\n\t"\
		"leaq	(%%r8,%%rcx,8),%%rcx	\n\t"\
		"leaq	(%%r8,%%rdx,8),%%rdx	\n\t"\
		"addq	$%c[__i4],%%rsi	\n\t"/* __in0 + c*ostride */\
		"leaq	0x20(%%rsi),%%rdi	\n\t"\
		"vmovaps	%c[__i1](%%rsi),%%ymm4	\n\t"\
		"vmovaps	%c[__i3](%%rsi),%%ymm6	\n\t"\
		"vmovaps	%c[__i1](%%rdi),%%ymm5	\n\t"\
		"vmovaps	%c[__i3](%%rdi),%%ymm7	\n\t"\
		"vmovaps	%%ymm4,%%ymm0		\n\t"\
		"vmovaps	%%ymm6,%%ymm2		\n\t"\
		"vmovaps	%%ymm5,%%ymm1		\n\t"\
		"vmovaps	%%ymm7,%%ymm3		\n\t"\
		"movq	%[__isrt2],%%rdi	\n\t"\
		"addq	$0x20,%%rdi	\n\t"/* cc0 */\
		"vmulpd	0x20(%%rdi),%%ymm4,%%ymm4	\n\t"\
		"vmulpd	    (%%rdi),%%ymm6,%%ymm6	\n\t"\
		"vmulpd	    (%%rdi),%%ymm1,%%ymm1	\n\t"\
		"vmulpd	0x20(%%rdi),%%ymm3,%%ymm3	\n\t"\
		"vmulpd	0x20(%%rdi),%%ymm5,%%ymm5	\n\t"\
		"vmulpd	    (%%rdi),%%ymm7,%%ymm7	\n\t"\
		"vmulpd	    (%%rdi),%%ymm0,%%ymm0	\n\t"\
		"vmulpd	0x20(%%rdi),%%ymm2,%%ymm2	\n\t"\
		"vsubpd	%%ymm1,%%ymm4,%%ymm4		\n\t"\
		"vsubpd	%%ymm3,%%ymm6,%%ymm6		\n\t"\
		"vaddpd	%%ymm0,%%ymm5,%%ymm5		\n\t"\
		"vaddpd	%%ymm2,%%ymm7,%%ymm7		\n\t"\
		"vsubpd	%%ymm6,%%ymm4,%%ymm4		\n\t"\
		"vsubpd	%%ymm7,%%ymm5,%%ymm5		\n\t"\
		"vaddpd	%%ymm6,%%ymm6,%%ymm6		\n\t"\
		"vaddpd	%%ymm7,%%ymm7,%%ymm7		\n\t"\
		"vaddpd	%%ymm4,%%ymm6,%%ymm6		\n\t"\
		"vaddpd	%%ymm5,%%ymm7,%%ymm7		\n\t"\
		"leaq	0x20(%%rsi),%%rdi	\n\t"\
		"vmovaps	%c[__i2](%%rsi),%%ymm2	\n\t"\
		"vmovaps	%c[__i2](%%rdi),%%ymm3	\n\t"\
		"vaddpd	%c[__i2](%%rdi),%%ymm2,%%ymm2	\n\t"\
		"vsubpd	%c[__i2](%%rsi),%%ymm3,%%ymm3	\n\t"\
		"movq	%[__isrt2],%%rdi	\n\t"\
		"vmulpd	(%%rdi),%%ymm2,%%ymm2	\n\t"/* mul by isrt2 */\
		"vmulpd	(%%rdi),%%ymm3,%%ymm3	\n\t"\
		"leaq	0x20(%%rsi),%%rdi	\n\t"\
		"vmovaps	        (%%rsi),%%ymm0	\n\t"\
		"vmovaps	        (%%rdi),%%ymm1	\n\t"\
		"vsubpd	%%ymm2,%%ymm0,%%ymm0		\n\t"\
		"vsubpd	%%ymm3,%%ymm1,%%ymm1		\n\t"\
		"vaddpd	        (%%rsi),%%ymm2,%%ymm2	\n\t"\
		"vaddpd	        (%%rdi),%%ymm3,%%ymm3	\n\t"\
		"vsubpd	%%ymm4,%%ymm0,%%ymm0		\n\t"\
		"vsubpd	%%ymm5,%%ymm1,%%ymm1		\n\t"\
		"vaddpd	%%ymm4,%%ymm4,%%ymm4		\n\t"\
		"vaddpd	%%ymm5,%%ymm5,%%ymm5		\n\t"\
		"vmovaps	%%ymm0,    (%%rbx)	\n\t"\
		"vmovaps	%%ymm1,0x20(%%rbx)	\n\t"\
		"vaddpd	%%ymm0,%%ymm4,%%ymm4	\n\t"\
		"vaddpd	%%ymm1,%%ymm5,%%ymm5	\n\t"\
		"vmovaps	%%ymm4,    (%%rax)	\n\t"\
		"vmovaps	%%ymm5,0x20(%%rax)	\n\t"\
		"vsubpd	%%ymm7,%%ymm2,%%ymm2	\n\t"\
		"vsubpd	%%ymm6,%%ymm3,%%ymm3	\n\t"\
		"vaddpd	%%ymm7,%%ymm7,%%ymm7	\n\t"\
		"vaddpd	%%ymm6,%%ymm6,%%ymm6	\n\t"\
		"vmovaps	%%ymm2,    (%%rcx)	\n\t"\
		"vmovaps	%%ymm3,0x20(%%rdx)	\n\t"\
		"vaddpd	%%ymm2,%%ymm7,%%ymm7	\n\t"\
		"vaddpd	%%ymm3,%%ymm6,%%ymm6	\n\t"\
		"vmovaps	%%ymm7,    (%%rdx)	\n\t"\
		"vmovaps	%%ymm6,0x20(%%rcx)	\n\t"\
		:					/* outputs: none */\
		:[__in0] "m" (Xin0)	/* All inputs from memory addresses here */\
		,[__i1] "e" (Xi1)\
		,[__i2] "e" (Xi2)\
		,[__i3] "e" (Xi3)\
		,[__i4] "e" (Xi4)\
		,[__isrt2] "m" (Xisrt2)\
		,[__two] "m" (Xtwo)\
		,[out0] "m" (Xout0) /* output-address-octet base pointer */\
		,[off] "m" (Xoff)	/* and pointer to uint32 array of 8 double* index offsets */\
		: "cc","memory","rax","rbx","rcx","rdx","rdi","rsi","r8","r9","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7"	/* Clobbered registers */\
	);\
	}

	// Same as above, but with specifiable I-addresses and regularly spaced O-addresses:

   #if 0	// 16-register version: This proves slower than 8-reg in both my SSE2/Core2 and AVX/Haswell tests:

	#define SSE2_RADIX16_DIF_0TWIDDLE_B(Xin0,Xin1,Xin2,Xin3,Xin4,Xin5,Xin6,Xin7,Xin8,Xin9,Xina,Xinb,Xinc,Xind,Xine,Xinf, Xisrt2,Xtwo, Xout0,Xout1,Xout2,Xout3)\
	{\
	__asm__ volatile (\
	/* SSE2_RADIX4_DIF_IN_PLACE(r1,r17,r9,r25): */	/* SSE2_RADIX4_DIF_IN_PLACE(r3,r19,r11,r27): */\
		"movq	%[__in0],%%rax				\n\t		movq	%[__in1],%%r10		\n\t"\
		"movq	%[__in8],%%rbx				\n\t		movq	%[__in9],%%r11		\n\t"\
		"movq	%[__in4],%%rcx				\n\t		movq	%[__in5],%%r12		\n\t"\
		"movq	%[__inc],%%rdx				\n\t		movq	%[__ind],%%r13		\n\t"\
		"vmovaps	    (%%rax),%%ymm0		\n\t		vmovaps	    (%%r10),%%ymm8	\n\t"\
		"vmovaps	0x20(%%rax),%%ymm1		\n\t		vmovaps	0x20(%%r10),%%ymm9	\n\t"\
		"vmovaps	    (%%rax),%%ymm2		\n\t		vmovaps	    (%%r10),%%ymm10	\n\t"\
		"vmovaps	0x20(%%rax),%%ymm3		\n\t		vmovaps	0x20(%%r10),%%ymm11	\n\t"\
		"vaddpd	    (%%rbx),%%ymm0,%%ymm0	\n\t		vaddpd	    (%%r11),%%ymm8 ,%%ymm8	\n\t"\
		"vaddpd	0x20(%%rbx),%%ymm1,%%ymm1	\n\t		vaddpd	0x20(%%r11),%%ymm9 ,%%ymm9	\n\t"\
		"vsubpd	    (%%rbx),%%ymm2,%%ymm2	\n\t		vsubpd	    (%%r11),%%ymm10,%%ymm10	\n\t"\
		"vsubpd	0x20(%%rbx),%%ymm3,%%ymm3	\n\t		vsubpd	0x20(%%r11),%%ymm11,%%ymm11	\n\t"\
		"vmovaps	    (%%rcx),%%ymm4		\n\t		vmovaps	    (%%r12),%%ymm12	\n\t"\
		"vmovaps	0x20(%%rcx),%%ymm5		\n\t		vmovaps	0x20(%%r12),%%ymm13	\n\t"\
		"vmovaps	    (%%rcx),%%ymm6		\n\t		vmovaps	    (%%r12),%%ymm14	\n\t"\
		"vmovaps	0x20(%%rcx),%%ymm7		\n\t		vmovaps	0x20(%%r12),%%ymm15	\n\t"\
		"vaddpd	    (%%rdx),%%ymm4,%%ymm4	\n\t		vaddpd	    (%%r13),%%ymm12,%%ymm12	\n\t"\
		"vaddpd	0x20(%%rdx),%%ymm5,%%ymm5	\n\t		vaddpd	0x20(%%r13),%%ymm13,%%ymm13	\n\t"\
		"vsubpd	    (%%rdx),%%ymm6,%%ymm6	\n\t		vsubpd	    (%%r13),%%ymm14,%%ymm14	\n\t"\
		"vsubpd	0x20(%%rdx),%%ymm7,%%ymm7	\n\t		vsubpd	0x20(%%r13),%%ymm15,%%ymm15	\n\t"\
		"vsubpd	%%ymm4,%%ymm0,%%ymm0		\n\t		vsubpd	%%ymm12,%%ymm8,%%ymm8		\n\t"\
		"vsubpd	%%ymm5,%%ymm1,%%ymm1		\n\t		vsubpd	%%ymm13,%%ymm9,%%ymm9		\n\t"\
		"vmovaps	%%ymm0,    (%%rbx)		\n\t		vmovaps	%%ymm8,    (%%r11)	\n\t"\
		"vmovaps	%%ymm1,0x20(%%rbx)		\n\t		vmovaps	%%ymm9,0x20(%%r11)	\n\t"\
		"vaddpd	%%ymm4,%%ymm4,%%ymm4		\n\t		vaddpd	%%ymm12,%%ymm12,%%ymm12		\n\t"\
		"vaddpd	%%ymm5,%%ymm5,%%ymm5		\n\t		vaddpd	%%ymm13,%%ymm13,%%ymm13		\n\t"\
		"vaddpd	%%ymm0,%%ymm4,%%ymm4		\n\t		vaddpd	%%ymm8 ,%%ymm12,%%ymm12		\n\t"\
		"vaddpd	%%ymm1,%%ymm5,%%ymm5		\n\t		vaddpd	%%ymm9 ,%%ymm13,%%ymm13		\n\t"\
		"vmovaps	%%ymm4,    (%%rax)		\n\t		vmovaps	%%ymm12,    (%%r10)	\n\t"\
		"vmovaps	%%ymm5,0x20(%%rax)		\n\t		vmovaps	%%ymm13,0x20(%%r10)	\n\t"\
		"vsubpd	%%ymm7,%%ymm2,%%ymm2		\n\t		vsubpd	%%ymm15,%%ymm10,%%ymm10		\n\t"\
		"vsubpd	%%ymm6,%%ymm3,%%ymm3		\n\t		vsubpd	%%ymm14,%%ymm11,%%ymm11		\n\t"\
		"vmovaps	%%ymm2,    (%%rcx)		\n\t		vmovaps	%%ymm10,    (%%r12)	\n\t"\
		"vmovaps	%%ymm3,0x20(%%rdx)		\n\t		vmovaps	%%ymm11,0x20(%%r13)	\n\t"\
		"vaddpd	%%ymm7,%%ymm7,%%ymm7		\n\t		vaddpd	%%ymm15,%%ymm15,%%ymm15		\n\t"\
		"vaddpd	%%ymm6,%%ymm6,%%ymm6		\n\t		vaddpd	%%ymm14,%%ymm14,%%ymm14		\n\t"\
		"vaddpd	%%ymm2,%%ymm7,%%ymm7		\n\t		vaddpd	%%ymm10,%%ymm15,%%ymm15		\n\t"\
		"vaddpd	%%ymm3,%%ymm6,%%ymm6		\n\t		vaddpd	%%ymm11,%%ymm14,%%ymm14		\n\t"\
		"vmovaps	%%ymm7,    (%%rdx)		\n\t		vmovaps	%%ymm15,    (%%r13)	\n\t"\
		"vmovaps	%%ymm6,0x20(%%rcx)		\n\t		vmovaps	%%ymm14,0x20(%%r12)	\n\t"\
	/* SSE2_RADIX4_DIF_IN_PLACE(r5,r21,r13,r29): */	/* SSE2_RADIX4_DIF_IN_PLACE(r7,r23,r15,r31): */\
		"movq	%[__in2],%%rax				\n\t		movq	%[__in3],%%r10		\n\t"\
		"movq	%[__ina],%%rbx				\n\t		movq	%[__inb],%%r11		\n\t"\
		"movq	%[__in6],%%rcx				\n\t		movq	%[__in7],%%r12		\n\t"\
		"movq	%[__ine],%%rdx				\n\t		movq	%[__inf],%%r13		\n\t"\
		"vmovaps	    (%%rax),%%ymm0		\n\t		vmovaps	    (%%r10),%%ymm8	\n\t"\
		"vmovaps	0x20(%%rax),%%ymm1		\n\t		vmovaps	0x20(%%r10),%%ymm9	\n\t"\
		"vmovaps	    (%%rax),%%ymm2		\n\t		vmovaps	    (%%r10),%%ymm10	\n\t"\
		"vmovaps	0x20(%%rax),%%ymm3		\n\t		vmovaps	0x20(%%r10),%%ymm11	\n\t"\
		"vaddpd	    (%%rbx),%%ymm0,%%ymm0	\n\t		vaddpd	    (%%r11),%%ymm8 ,%%ymm8	\n\t"\
		"vaddpd	0x20(%%rbx),%%ymm1,%%ymm1	\n\t		vaddpd	0x20(%%r11),%%ymm9 ,%%ymm9	\n\t"\
		"vsubpd	    (%%rbx),%%ymm2,%%ymm2	\n\t		vsubpd	    (%%r11),%%ymm10,%%ymm10	\n\t"\
		"vsubpd	0x20(%%rbx),%%ymm3,%%ymm3	\n\t		vsubpd	0x20(%%r11),%%ymm11,%%ymm11	\n\t"\
		"vmovaps	    (%%rcx),%%ymm4		\n\t		vmovaps	    (%%r12),%%ymm12	\n\t"\
		"vmovaps	0x20(%%rcx),%%ymm5		\n\t		vmovaps	0x20(%%r12),%%ymm13	\n\t"\
		"vmovaps	    (%%rcx),%%ymm6		\n\t		vmovaps	    (%%r12),%%ymm14	\n\t"\
		"vmovaps	0x20(%%rcx),%%ymm7		\n\t		vmovaps	0x20(%%r12),%%ymm15	\n\t"\
		"vaddpd	    (%%rdx),%%ymm4,%%ymm4	\n\t		vaddpd	    (%%r13),%%ymm12,%%ymm12	\n\t"\
		"vaddpd	0x20(%%rdx),%%ymm5,%%ymm5	\n\t		vaddpd	0x20(%%r13),%%ymm13,%%ymm13	\n\t"\
		"vsubpd	    (%%rdx),%%ymm6,%%ymm6	\n\t		vsubpd	    (%%r13),%%ymm14,%%ymm14	\n\t"\
		"vsubpd	0x20(%%rdx),%%ymm7,%%ymm7	\n\t		vsubpd	0x20(%%r13),%%ymm15,%%ymm15	\n\t"\
		"vsubpd	%%ymm4,%%ymm0,%%ymm0		\n\t		vsubpd	%%ymm12,%%ymm8,%%ymm8		\n\t"\
		"vsubpd	%%ymm5,%%ymm1,%%ymm1		\n\t		vsubpd	%%ymm13,%%ymm9,%%ymm9		\n\t"\
		"vmovaps	%%ymm0,    (%%rbx)		\n\t		vmovaps	%%ymm8,    (%%r11)	\n\t"\
		"vmovaps	%%ymm1,0x20(%%rbx)		\n\t		vmovaps	%%ymm9,0x20(%%r11)	\n\t"\
		"vaddpd	%%ymm4,%%ymm4,%%ymm4		\n\t		vaddpd	%%ymm12,%%ymm12,%%ymm12		\n\t"\
		"vaddpd	%%ymm5,%%ymm5,%%ymm5		\n\t		vaddpd	%%ymm13,%%ymm13,%%ymm13		\n\t"\
		"vaddpd	%%ymm0,%%ymm4,%%ymm4		\n\t		vaddpd	%%ymm8 ,%%ymm12,%%ymm12		\n\t"\
		"vaddpd	%%ymm1,%%ymm5,%%ymm5		\n\t		vaddpd	%%ymm9 ,%%ymm13,%%ymm13		\n\t"\
		"vmovaps	%%ymm4,    (%%rax)		\n\t		vmovaps	%%ymm12,    (%%r10)	\n\t"\
		"vmovaps	%%ymm5,0x20(%%rax)		\n\t		vmovaps	%%ymm13,0x20(%%r10)	\n\t"\
		"vsubpd	%%ymm7,%%ymm2,%%ymm2		\n\t		vsubpd	%%ymm15,%%ymm10,%%ymm10		\n\t"\
		"vsubpd	%%ymm6,%%ymm3,%%ymm3		\n\t		vsubpd	%%ymm14,%%ymm11,%%ymm11		\n\t"\
		"vmovaps	%%ymm2,    (%%rcx)		\n\t		vmovaps	%%ymm10,    (%%r12)	\n\t"\
		"vmovaps	%%ymm3,0x20(%%rdx)		\n\t		vmovaps	%%ymm11,0x20(%%r13)	\n\t"\
		"vaddpd	%%ymm7,%%ymm7,%%ymm7		\n\t		vaddpd	%%ymm15,%%ymm15,%%ymm15		\n\t"\
		"vaddpd	%%ymm6,%%ymm6,%%ymm6		\n\t		vaddpd	%%ymm14,%%ymm14,%%ymm14		\n\t"\
		"vaddpd	%%ymm2,%%ymm7,%%ymm7		\n\t		vaddpd	%%ymm10,%%ymm15,%%ymm15		\n\t"\
		"vaddpd	%%ymm3,%%ymm6,%%ymm6		\n\t		vaddpd	%%ymm11,%%ymm14,%%ymm14		\n\t"\
		"vmovaps	%%ymm7,    (%%rdx)		\n\t		vmovaps	%%ymm15,    (%%r13)	\n\t"\
		"vmovaps	%%ymm6,0x20(%%rcx)		\n\t		vmovaps	%%ymm14,0x20(%%r12)	\n\t"\
	/*** Now do 4 DFTs with internal twiddles on the 1*stride - separated data. Do blocks in order 0,2,1,3 to allow increment-only of rsi-datum from 1 block to the next: ***/\
		"movq	%[__isrt2],%%rdi			\n\t"\
		/* Block 0: r0-3 */								/* Block 1: r8-b */\
		"movq	%[__out0],%%rsi				\n\t		movq	%[__out1],%%r8	\n\t"\
		"movq	%[__in0],%%rax				\n\t		movq	%[__in8],%%r10		\n\t"\
		"movq	%[__in2],%%rbx				\n\t		movq	%[__ina],%%r11		\n\t"\
		"movq	%[__in1],%%rcx				\n\t		movq	%[__in9],%%r12		\n\t"\
		"movq	%[__in3],%%rdx				\n\t		movq	%[__inb],%%r13		\n\t"\
		"vmovaps	    (%%rax),%%ymm0		\n\t		vmovaps	    (%%r10),%%ymm8	\n\t"\
		"vmovaps	0x20(%%rax),%%ymm1		\n\t		vmovaps	0x20(%%r10),%%ymm9	\n\t"\
		"vmovaps		(%%rbx),%%ymm2		\n\t		vmovaps		(%%r11),%%ymm10	\n\t"\
		"vmovaps	0x20(%%rbx),%%ymm3		\n\t		vmovaps	0x20(%%r11),%%ymm11	\n\t"\
		"vsubpd		(%%rbx),%%ymm0,%%ymm0	\n\t		vsubpd	0x20(%%r11),%%ymm8 ,%%ymm8	\n\t"\
		"vsubpd	0x20(%%rbx),%%ymm1,%%ymm1	\n\t		vsubpd		(%%r11),%%ymm9 ,%%ymm9	\n\t"\
		"vaddpd	    (%%rax),%%ymm2,%%ymm2	\n\t		vaddpd	0x20(%%r10),%%ymm10,%%ymm10	\n\t"\
		"vaddpd	0x20(%%rax),%%ymm3,%%ymm3	\n\t		vaddpd		(%%r10),%%ymm11,%%ymm11	\n\t"\
		"vmovaps		(%%rcx),%%ymm4		\n\t		vmovaps		(%%r12),%%ymm12	\n\t"\
		"vmovaps	0x20(%%rcx),%%ymm5		\n\t		vmovaps	0x20(%%r12),%%ymm13	\n\t"\
		"vmovaps	    (%%rdx),%%ymm6		\n\t		vmovaps	    (%%r13),%%ymm14	\n\t"\
		"vmovaps	0x20(%%rdx),%%ymm7		\n\t		vmovaps	0x20(%%r13),%%ymm15	\n\t"\
		"vsubpd		(%%rdx),%%ymm4,%%ymm4	\n\t		vsubpd	0x20(%%r12),%%ymm12,%%ymm12	\n\t"\
		"vsubpd	0x20(%%rdx),%%ymm5,%%ymm5	\n\t		vaddpd		(%%r12),%%ymm13,%%ymm13	\n\t"\
		"vaddpd	    (%%rcx),%%ymm6,%%ymm6	\n\t		vaddpd	0x20(%%r13),%%ymm14,%%ymm14	\n\t"\
		"vaddpd	0x20(%%rcx),%%ymm7,%%ymm7	\n\t		vsubpd		(%%r13),%%ymm15,%%ymm15	\n\t"\
		"vsubpd	%%ymm6,%%ymm2,%%ymm2		\n\t		vmulpd	(%%rdi),%%ymm12,%%ymm12		\n\t"\
		"vsubpd	%%ymm7,%%ymm3,%%ymm3		\n\t		vmulpd	(%%rdi),%%ymm13,%%ymm13		\n\t"\
		"vaddpd	%%ymm6,%%ymm6,%%ymm6		\n\t		vmulpd	(%%rdi),%%ymm14,%%ymm14		\n\t"\
		"vaddpd	%%ymm7,%%ymm7,%%ymm7		\n\t		vmulpd	(%%rdi),%%ymm15,%%ymm15		\n\t"\
		"vmovaps	%%ymm2,0x40(%%rsi)		\n\t		vsubpd	%%ymm14,%%ymm12,%%ymm12		\n\t"\
		"vmovaps	%%ymm3,0x60(%%rsi)		\n\t		vsubpd	%%ymm15,%%ymm13,%%ymm13		\n\t"\
		"vaddpd	%%ymm2,%%ymm6,%%ymm6		\n\t		vaddpd	%%ymm14,%%ymm14,%%ymm14		\n\t"\
		"vaddpd	%%ymm3,%%ymm7,%%ymm7		\n\t		vaddpd	%%ymm15,%%ymm15,%%ymm15		\n\t"\
		"vmovaps	%%ymm6,    (%%rsi)		\n\t		vaddpd	%%ymm12,%%ymm14,%%ymm14		\n\t"\
		"vmovaps	%%ymm7,0x20(%%rsi)		\n\t		vaddpd	%%ymm13,%%ymm15,%%ymm15		\n\t"\
		"vsubpd	%%ymm5,%%ymm0,%%ymm0		\n\t		vsubpd	%%ymm12,%%ymm8 ,%%ymm8		\n\t"\
		"vsubpd	%%ymm4,%%ymm1,%%ymm1		\n\t		vsubpd	%%ymm13,%%ymm10,%%ymm10		\n\t"\
		"vaddpd	%%ymm5,%%ymm5,%%ymm5		\n\t		vaddpd	%%ymm12,%%ymm12,%%ymm12		\n\t"\
		"vaddpd	%%ymm4,%%ymm4,%%ymm4		\n\t		vaddpd	%%ymm13,%%ymm13,%%ymm13		\n\t"\
		"vmovaps	%%ymm0,0x80(%%rsi)		\n\t		vmovaps	%%ymm8 ,0x40(%%r8)	\n\t"\
		"vmovaps	%%ymm1,0xe0(%%rsi)		\n\t		vmovaps	%%ymm10,0x60(%%r8)	\n\t"\
		"vaddpd	%%ymm0,%%ymm5,%%ymm5		\n\t		vaddpd	%%ymm8 ,%%ymm12,%%ymm12	\n\t"\
		"vaddpd	%%ymm1,%%ymm4,%%ymm4		\n\t		vaddpd	%%ymm10,%%ymm13,%%ymm13	\n\t"\
		"vmovaps	%%ymm5,0xc0(%%rsi)		\n\t		vmovaps	%%ymm12,    (%%r8)	\n\t"\
		"vmovaps	%%ymm4,0xa0(%%rsi)		\n\t		vmovaps	%%ymm13,0x20(%%r8)	\n\t"\
		/* Block 2: */								"	vsubpd	%%ymm15,%%ymm11,%%ymm11	\n\t"\
		"movq	%[__out2],%%rsi				\n\t		vsubpd	%%ymm14,%%ymm9 ,%%ymm9	\n\t"\
		"movq	%[__in4],%%rax				\n\t		vaddpd	%%ymm15,%%ymm15,%%ymm15	\n\t"\
		"movq	%[__in6],%%rbx				\n\t		vaddpd	%%ymm14,%%ymm14,%%ymm14	\n\t"\
		"movq	%[__in5],%%rcx				\n\t		vmovaps	%%ymm11,0x80(%%r8)	\n\t"\
		"movq	%[__in7],%%rdx				\n\t		vmovaps	%%ymm9 ,0xe0(%%r8)	\n\t"\
		"vmovaps	    (%%rcx),%%ymm4		\n\t		vaddpd	%%ymm11,%%ymm15,%%ymm15	\n\t"\
		"vmovaps		(%%rdx),%%ymm6		\n\t		vaddpd	%%ymm9 ,%%ymm14,%%ymm14	\n\t"\
		"vmovaps	0x20(%%rcx),%%ymm5		\n\t		vmovaps	%%ymm15,0xc0(%%r8)	\n\t"\
		"vmovaps	0x20(%%rdx),%%ymm7		\n\t		vmovaps	%%ymm14,0xa0(%%r8)	\n\t"\
		"vmovaps	%%ymm4,%%ymm0			\n\t	"	/* Block 3: */\
		"vmovaps	%%ymm6,%%ymm2			\n\t		movq	%[__out3],%%r8	\n\t"\
		"vmovaps	%%ymm5,%%ymm1			\n\t		movq	%[__inc],%%r10		\n\t"\
		"vmovaps	%%ymm7,%%ymm3			\n\t		movq	%[__ine],%%r11		\n\t"\
		"vmulpd	0x20(%%rdi),%%ymm4,%%ymm4	\n\t		movq	%[__ind],%%r12		\n\t"\
		"vmulpd	0x40(%%rdi),%%ymm6,%%ymm6	\n\t		movq	%[__inf],%%r13		\n\t"\
		"vmulpd	0x40(%%rdi),%%ymm1,%%ymm1	\n\t		vmovaps	    (%%r12),%%ymm12	\n\t"\
		"vmulpd	0x20(%%rdi),%%ymm3,%%ymm3	\n\t		vmovaps		(%%r13),%%ymm14	\n\t"\
		"vmulpd	0x20(%%rdi),%%ymm5,%%ymm5	\n\t		vmovaps	0x20(%%r12),%%ymm13	\n\t"\
		"vmulpd	0x40(%%rdi),%%ymm7,%%ymm7	\n\t		vmovaps	0x20(%%r13),%%ymm15	\n\t"\
		"vmulpd	0x40(%%rdi),%%ymm0,%%ymm0	\n\t		vmovaps	%%ymm12,%%ymm8		\n\t"\
		"vmulpd	0x20(%%rdi),%%ymm2,%%ymm2	\n\t		vmovaps	%%ymm14,%%ymm10		\n\t"\
		"vsubpd	%%ymm1,%%ymm4,%%ymm4		\n\t		vmovaps	%%ymm13,%%ymm9		\n\t"\
		"vsubpd	%%ymm3,%%ymm6,%%ymm6		\n\t		vmovaps	%%ymm15,%%ymm11		\n\t"\
		"vaddpd	%%ymm0,%%ymm5,%%ymm5		\n\t		vmulpd	0x40(%%rdi),%%ymm12,%%ymm12	\n\t"\
		"vaddpd	%%ymm2,%%ymm7,%%ymm7		\n\t		vmulpd	0x20(%%rdi),%%ymm14,%%ymm14	\n\t"\
		"vsubpd	%%ymm6,%%ymm4,%%ymm4		\n\t		vmulpd	0x20(%%rdi),%%ymm9 ,%%ymm9	\n\t"\
		"vsubpd	%%ymm7,%%ymm5,%%ymm5		\n\t		vmulpd	0x40(%%rdi),%%ymm11,%%ymm11	\n\t"\
		"vaddpd	%%ymm6,%%ymm6,%%ymm6		\n\t		vmulpd	0x40(%%rdi),%%ymm13,%%ymm13	\n\t"\
		"vaddpd	%%ymm7,%%ymm7,%%ymm7		\n\t		vmulpd	0x20(%%rdi),%%ymm15,%%ymm15	\n\t"\
		"vaddpd	%%ymm4,%%ymm6,%%ymm6		\n\t		vmulpd	0x20(%%rdi),%%ymm8 ,%%ymm8	\n\t"\
		"vaddpd	%%ymm5,%%ymm7,%%ymm7		\n\t		vmulpd	0x40(%%rdi),%%ymm10,%%ymm10	\n\t"\
		"vmovaps	    (%%rbx),%%ymm2		\n\t		vsubpd	%%ymm9 ,%%ymm12,%%ymm12	\n\t"\
		"vmovaps	0x20(%%rbx),%%ymm3		\n\t		vsubpd	%%ymm11,%%ymm14,%%ymm14	\n\t"\
		"vsubpd	0x20(%%rbx),%%ymm2,%%ymm2	\n\t		vaddpd	%%ymm8 ,%%ymm13,%%ymm13	\n\t"\
		"vaddpd		(%%rbx),%%ymm3,%%ymm3	\n\t		vaddpd	%%ymm10,%%ymm15,%%ymm15	\n\t"\
		"vmulpd	(%%rdi),%%ymm2,%%ymm2		\n\t		vsubpd	%%ymm14,%%ymm12,%%ymm12	\n\t"\
		"vmulpd	(%%rdi),%%ymm3,%%ymm3		\n\t		vsubpd	%%ymm15,%%ymm13,%%ymm13	\n\t"\
		"vmovaps	    (%%rax),%%ymm0		\n\t		vaddpd	%%ymm14,%%ymm14,%%ymm14	\n\t"\
		"vmovaps	0x20(%%rax),%%ymm1		\n\t		vaddpd	%%ymm15,%%ymm15,%%ymm15	\n\t"\
		"vsubpd	%%ymm2,%%ymm0,%%ymm0		\n\t		vaddpd	%%ymm12,%%ymm14,%%ymm14	\n\t"\
		"vsubpd	%%ymm3,%%ymm1,%%ymm1		\n\t		vaddpd	%%ymm13,%%ymm15,%%ymm15	\n\t"\
		"vaddpd	    (%%rax),%%ymm2,%%ymm2	\n\t		vmovaps	    (%%r11),%%ymm10	\n\t"\
		"vaddpd	0x20(%%rax),%%ymm3,%%ymm3	\n\t		vmovaps	0x20(%%r11),%%ymm11	\n\t"\
		"vsubpd	%%ymm6,%%ymm2,%%ymm2		\n\t		vaddpd	0x20(%%r11),%%ymm10,%%ymm10	\n\t"\
		"vsubpd	%%ymm7,%%ymm3,%%ymm3		\n\t		vsubpd	    (%%r11),%%ymm11,%%ymm11	\n\t"\
		"vaddpd	%%ymm6,%%ymm6,%%ymm6		\n\t		vmulpd	(%%rdi),%%ymm10,%%ymm10	\n\t"/* mul by isrt2 */\
		"vaddpd	%%ymm7,%%ymm7,%%ymm7		\n\t		vmulpd	(%%rdi),%%ymm11,%%ymm11	\n\t"\
		"vmovaps	%%ymm2,0x40(%%rsi)		\n\t		vmovaps	    (%%r10),%%ymm8	\n\t"\
		"vmovaps	%%ymm3,0x60(%%rsi)		\n\t		vmovaps	0x20(%%r10),%%ymm9	\n\t"\
		"vaddpd	%%ymm2,%%ymm6,%%ymm6		\n\t		vsubpd	%%ymm10,%%ymm8,%%ymm8		\n\t"\
		"vaddpd	%%ymm3,%%ymm7,%%ymm7		\n\t		vsubpd	%%ymm11,%%ymm9,%%ymm9		\n\t"\
		"vmovaps	%%ymm6,    (%%rsi)		\n\t		vaddpd	    (%%r10),%%ymm10,%%ymm10	\n\t"\
		"vmovaps	%%ymm7,0x20(%%rsi)		\n\t		vaddpd	0x20(%%r10),%%ymm11,%%ymm11	\n\t"\
		"vsubpd	%%ymm5,%%ymm0,%%ymm0		\n\t		vsubpd	%%ymm12,%%ymm8 ,%%ymm8		\n\t"\
		"vsubpd	%%ymm4,%%ymm1,%%ymm1		\n\t		vsubpd	%%ymm13,%%ymm9 ,%%ymm9		\n\t"\
		"vaddpd	%%ymm5,%%ymm5,%%ymm5		\n\t		vaddpd	%%ymm12,%%ymm12,%%ymm12		\n\t"\
		"vaddpd	%%ymm4,%%ymm4,%%ymm4		\n\t		vaddpd	%%ymm13,%%ymm13,%%ymm13		\n\t"\
		"vmovaps	%%ymm0,0x80(%%rsi)		\n\t		vmovaps	%%ymm8,0x40(%%r8)	\n\t"\
		"vmovaps	%%ymm1,0xe0(%%rsi)		\n\t		vmovaps	%%ymm9,0x60(%%r8)	\n\t"\
		"vaddpd	%%ymm0,%%ymm5,%%ymm5		\n\t		vaddpd	%%ymm8,%%ymm12,%%ymm12	\n\t"\
		"vaddpd	%%ymm1,%%ymm4,%%ymm4		\n\t		vaddpd	%%ymm9,%%ymm13,%%ymm13	\n\t"\
		"vmovaps	%%ymm5,0xc0(%%rsi)		\n\t		vmovaps	%%ymm12,    (%%r8)	\n\t"\
		"vmovaps	%%ymm4,0xa0(%%rsi)		\n\t		vmovaps	%%ymm13,0x20(%%r8)	\n\t"\
		"												vsubpd	%%ymm15,%%ymm10,%%ymm10	\n\t"\
		"												vsubpd	%%ymm14,%%ymm11,%%ymm11	\n\t"\
		"												vaddpd	%%ymm15,%%ymm15,%%ymm15	\n\t"\
		"												vaddpd	%%ymm14,%%ymm14,%%ymm14	\n\t"\
		"												vmovaps	%%ymm10,0x80(%%r8)	\n\t"\
		"												vmovaps	%%ymm11,0xe0(%%r8)	\n\t"\
		"												vaddpd	%%ymm10,%%ymm15,%%ymm15	\n\t"\
		"												vaddpd	%%ymm11,%%ymm14,%%ymm14	\n\t"\
		"												vmovaps	%%ymm15,0xc0(%%r8)	\n\t"\
		"												vmovaps	%%ymm14,0xa0(%%r8)	\n\t"\
		:					/* outputs: none */\
		:[__in0] "m" (Xin0)	/* All inputs from memory addresses here */\
		,[__in1] "m" (Xin1)\
		,[__in2] "m" (Xin2)\
		,[__in3] "m" (Xin3)\
		,[__in4] "m" (Xin4)\
		,[__in5] "m" (Xin5)\
		,[__in6] "m" (Xin6)\
		,[__in7] "m" (Xin7)\
		,[__in8] "m" (Xin8)\
		,[__in9] "m" (Xin9)\
		,[__ina] "m" (Xina)\
		,[__inb] "m" (Xinb)\
		,[__inc] "m" (Xinc)\
		,[__ind] "m" (Xind)\
		,[__ine] "m" (Xine)\
		,[__inf] "m" (Xinf)\
		,[__two] "m" (Xtwo)\
		,[__isrt2] "m" (Xisrt2)\
		,[__out0] "m" (Xout0)\
		,[__out1] "m" (Xout1)\
		,[__out2] "m" (Xout2)\
		,[__out3] "m" (Xout3)\
		: "cc","memory","rax","rbx","rcx","rdx","rdi","rsi","r8","r10","r11","r12","r13","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15"		/* Clobbered registers */\
	);\
	}

   #else

	#define SSE2_RADIX16_DIF_0TWIDDLE_B(Xin0,Xi1,Xi2,Xi3,Xi4, Xisrt2,Xtwo, Xout0)\
	{\
	__asm__ volatile (\
		/* SSE2_RADIX4_DIF_IN_PLACE(r1 , r17, r9 , r25): */\
		"movq	%[__in0],%%rax		\n\t"\
		"leaq	%c[__i4](%%rax),%%rcx	\n\t"/* __in0 +   [4*istride]; note BR of [a,b,c,d]-ptrs, i.e. b/c swap */\
		"leaq	%c[__i4](%%rcx),%%rbx	\n\t"/* __in0 + 2*[4*istride] */\
		"leaq	%c[__i4](%%rbx),%%rdx	\n\t"/* __in0 + 3*[4*istride] */\
		"vmovaps	    (%%rax),%%ymm0	\n\t"\
		"vmovaps	0x20(%%rax),%%ymm1	\n\t"\
		"vmovaps	    (%%rax),%%ymm2	\n\t"\
		"vmovaps	0x20(%%rax),%%ymm3	\n\t"\
		"vaddpd	    (%%rbx),%%ymm0,%%ymm0	\n\t"\
		"vaddpd	0x20(%%rbx),%%ymm1,%%ymm1	\n\t"\
		"vsubpd	    (%%rbx),%%ymm2,%%ymm2	\n\t"\
		"vsubpd	0x20(%%rbx),%%ymm3,%%ymm3	\n\t"\
		"vmovaps	    (%%rcx),%%ymm4	\n\t"\
		"vmovaps	0x20(%%rcx),%%ymm5	\n\t"\
		"vmovaps	    (%%rcx),%%ymm6	\n\t"\
		"vmovaps	0x20(%%rcx),%%ymm7	\n\t"\
		"vaddpd	    (%%rdx),%%ymm4,%%ymm4	\n\t"\
		"vaddpd	0x20(%%rdx),%%ymm5,%%ymm5	\n\t"\
		"vsubpd	    (%%rdx),%%ymm6,%%ymm6	\n\t"\
		"vsubpd	0x20(%%rdx),%%ymm7,%%ymm7	\n\t"\
		"vsubpd	%%ymm4,%%ymm0,%%ymm0		\n\t"\
		"vsubpd	%%ymm5,%%ymm1,%%ymm1		\n\t"\
		"vmovaps	%%ymm0,    (%%rbx)	\n\t"\
		"vmovaps	%%ymm1,0x20(%%rbx)	\n\t"\
		"vaddpd	%%ymm4,%%ymm4,%%ymm4		\n\t"\
		"vaddpd	%%ymm5,%%ymm5,%%ymm5		\n\t"\
		"vaddpd	%%ymm0,%%ymm4,%%ymm4		\n\t"\
		"vaddpd	%%ymm1,%%ymm5,%%ymm5		\n\t"\
		"vmovaps	%%ymm4,    (%%rax)	\n\t"\
		"vmovaps	%%ymm5,0x20(%%rax)	\n\t"\
		"vsubpd	%%ymm7,%%ymm2,%%ymm2		\n\t"\
		"vsubpd	%%ymm6,%%ymm3,%%ymm3		\n\t"\
		"vmovaps	%%ymm2,    (%%rcx)	\n\t"\
		"vmovaps	%%ymm3,0x20(%%rdx)	\n\t"\
		"vaddpd	%%ymm7,%%ymm7,%%ymm7		\n\t"\
		"vaddpd	%%ymm6,%%ymm6,%%ymm6		\n\t"\
		"vaddpd	%%ymm2,%%ymm7,%%ymm7		\n\t"\
		"vaddpd	%%ymm3,%%ymm6,%%ymm6		\n\t"\
		"vmovaps	%%ymm7,    (%%rdx)	\n\t"\
		"vmovaps	%%ymm6,0x20(%%rcx)	\n\t"\
		/* SSE2_RADIX4_DIF_IN_PLACE(r5 , r21, r13, r29): */\
		"addq	$%c[__i2],%%rax	\n\t"/* All addresses += 2*ostride */\
		"addq	$%c[__i2],%%rbx	\n\t"\
		"addq	$%c[__i2],%%rcx	\n\t"\
		"addq	$%c[__i2],%%rdx	\n\t"\
		"vmovaps	    (%%rax),%%ymm0	\n\t"\
		"vmovaps	0x20(%%rax),%%ymm1	\n\t"\
		"vmovaps	    (%%rax),%%ymm2	\n\t"\
		"vmovaps	0x20(%%rax),%%ymm3	\n\t"\
		"vaddpd	    (%%rbx),%%ymm0,%%ymm0	\n\t"\
		"vaddpd	0x20(%%rbx),%%ymm1,%%ymm1	\n\t"\
		"vsubpd	    (%%rbx),%%ymm2,%%ymm2	\n\t"\
		"vsubpd	0x20(%%rbx),%%ymm3,%%ymm3	\n\t"\
		"vmovaps	    (%%rcx),%%ymm4	\n\t"\
		"vmovaps	0x20(%%rcx),%%ymm5	\n\t"\
		"vmovaps	    (%%rcx),%%ymm6	\n\t"\
		"vmovaps	0x20(%%rcx),%%ymm7	\n\t"\
		"vaddpd	    (%%rdx),%%ymm4,%%ymm4	\n\t"\
		"vaddpd	0x20(%%rdx),%%ymm5,%%ymm5	\n\t"\
		"vsubpd	    (%%rdx),%%ymm6,%%ymm6	\n\t"\
		"vsubpd	0x20(%%rdx),%%ymm7,%%ymm7	\n\t"\
		"vsubpd	%%ymm4,%%ymm0,%%ymm0		\n\t"\
		"vsubpd	%%ymm5,%%ymm1,%%ymm1		\n\t"\
		"vmovaps	%%ymm0,    (%%rbx)	\n\t"\
		"vmovaps	%%ymm1,0x20(%%rbx)	\n\t"\
		"vaddpd	%%ymm4,%%ymm4,%%ymm4		\n\t"\
		"vaddpd	%%ymm5,%%ymm5,%%ymm5		\n\t"\
		"vaddpd	%%ymm0,%%ymm4,%%ymm4		\n\t"\
		"vaddpd	%%ymm1,%%ymm5,%%ymm5		\n\t"\
		"vmovaps	%%ymm4,    (%%rax)	\n\t"\
		"vmovaps	%%ymm5,0x20(%%rax)	\n\t"\
		"vsubpd	%%ymm7,%%ymm2,%%ymm2		\n\t"\
		"vsubpd	%%ymm6,%%ymm3,%%ymm3		\n\t"\
		"vmovaps	%%ymm2,    (%%rcx)	\n\t"\
		"vmovaps	%%ymm3,0x20(%%rdx)	\n\t"\
		"vaddpd	%%ymm7,%%ymm7,%%ymm7		\n\t"\
		"vaddpd	%%ymm6,%%ymm6,%%ymm6		\n\t"\
		"vaddpd	%%ymm2,%%ymm7,%%ymm7		\n\t"\
		"vaddpd	%%ymm3,%%ymm6,%%ymm6		\n\t"\
		"vmovaps	%%ymm7,    (%%rdx)	\n\t"\
		"vmovaps	%%ymm6,0x20(%%rcx)	\n\t"\
		/* SSE2_RADIX4_DIF_IN_PLACE(r3 , r19, r11, r27): */\
		"subq	$%c[__i1],%%rax	\n\t"/* All addresses -= 1*ostride */\
		"subq	$%c[__i1],%%rbx	\n\t"\
		"subq	$%c[__i1],%%rcx	\n\t"\
		"subq	$%c[__i1],%%rdx	\n\t"\
		"vmovaps	    (%%rax),%%ymm0	\n\t"\
		"vmovaps	0x20(%%rax),%%ymm1	\n\t"\
		"vmovaps	    (%%rax),%%ymm2	\n\t"\
		"vmovaps	0x20(%%rax),%%ymm3	\n\t"\
		"vaddpd	    (%%rbx),%%ymm0,%%ymm0	\n\t"\
		"vaddpd	0x20(%%rbx),%%ymm1,%%ymm1	\n\t"\
		"vsubpd	    (%%rbx),%%ymm2,%%ymm2	\n\t"\
		"vsubpd	0x20(%%rbx),%%ymm3,%%ymm3	\n\t"\
		"vmovaps	    (%%rcx),%%ymm4	\n\t"\
		"vmovaps	0x20(%%rcx),%%ymm5	\n\t"\
		"vmovaps	    (%%rcx),%%ymm6	\n\t"\
		"vmovaps	0x20(%%rcx),%%ymm7	\n\t"\
		"vaddpd	    (%%rdx),%%ymm4,%%ymm4	\n\t"\
		"vaddpd	0x20(%%rdx),%%ymm5,%%ymm5	\n\t"\
		"vsubpd	    (%%rdx),%%ymm6,%%ymm6	\n\t"\
		"vsubpd	0x20(%%rdx),%%ymm7,%%ymm7	\n\t"\
		"vsubpd	%%ymm4,%%ymm0,%%ymm0		\n\t"\
		"vsubpd	%%ymm5,%%ymm1,%%ymm1		\n\t"\
		"vmovaps	%%ymm0,    (%%rbx)	\n\t"\
		"vmovaps	%%ymm1,0x20(%%rbx)	\n\t"\
		"vaddpd	%%ymm4,%%ymm4,%%ymm4		\n\t"\
		"vaddpd	%%ymm5,%%ymm5,%%ymm5		\n\t"\
		"vaddpd	%%ymm0,%%ymm4,%%ymm4		\n\t"\
		"vaddpd	%%ymm1,%%ymm5,%%ymm5		\n\t"\
		"vmovaps	%%ymm4,    (%%rax)	\n\t"\
		"vmovaps	%%ymm5,0x20(%%rax)	\n\t"\
		"vsubpd	%%ymm7,%%ymm2,%%ymm2		\n\t"\
		"vsubpd	%%ymm6,%%ymm3,%%ymm3		\n\t"\
		"vmovaps	%%ymm2,    (%%rcx)	\n\t"\
		"vmovaps	%%ymm3,0x20(%%rdx)	\n\t"\
		"vaddpd	%%ymm7,%%ymm7,%%ymm7		\n\t"\
		"vaddpd	%%ymm6,%%ymm6,%%ymm6		\n\t"\
		"vaddpd	%%ymm2,%%ymm7,%%ymm7		\n\t"\
		"vaddpd	%%ymm3,%%ymm6,%%ymm6		\n\t"\
		"vmovaps	%%ymm7,    (%%rdx)	\n\t"\
		"vmovaps	%%ymm6,0x20(%%rcx)	\n\t"\
		/* SSE2_RADIX4_DIF_IN_PLACE(r7 , r23, r15, r31): */\
		"addq	$%c[__i2],%%rax	\n\t"/* All addresses += 2*ostride */\
		"addq	$%c[__i2],%%rbx	\n\t"\
		"addq	$%c[__i2],%%rcx	\n\t"\
		"addq	$%c[__i2],%%rdx	\n\t"\
		"vmovaps	    (%%rax),%%ymm0	\n\t"\
		"vmovaps	0x20(%%rax),%%ymm1	\n\t"\
		"vmovaps	    (%%rax),%%ymm2	\n\t"\
		"vmovaps	0x20(%%rax),%%ymm3	\n\t"\
		"vaddpd	    (%%rbx),%%ymm0,%%ymm0	\n\t"\
		"vaddpd	0x20(%%rbx),%%ymm1,%%ymm1	\n\t"\
		"vsubpd	    (%%rbx),%%ymm2,%%ymm2	\n\t"\
		"vsubpd	0x20(%%rbx),%%ymm3,%%ymm3	\n\t"\
		"vmovaps	    (%%rcx),%%ymm4	\n\t"\
		"vmovaps	0x20(%%rcx),%%ymm5	\n\t"\
		"vmovaps	    (%%rcx),%%ymm6	\n\t"\
		"vmovaps	0x20(%%rcx),%%ymm7	\n\t"\
		"vaddpd	    (%%rdx),%%ymm4,%%ymm4	\n\t"\
		"vaddpd	0x20(%%rdx),%%ymm5,%%ymm5	\n\t"\
		"vsubpd	    (%%rdx),%%ymm6,%%ymm6	\n\t"\
		"vsubpd	0x20(%%rdx),%%ymm7,%%ymm7	\n\t"\
		"vsubpd	%%ymm4,%%ymm0,%%ymm0		\n\t"\
		"vsubpd	%%ymm5,%%ymm1,%%ymm1		\n\t"\
		"vmovaps	%%ymm0,    (%%rbx)	\n\t"\
		"vmovaps	%%ymm1,0x20(%%rbx)	\n\t"\
		"vaddpd	%%ymm4,%%ymm4,%%ymm4		\n\t"\
		"vaddpd	%%ymm5,%%ymm5,%%ymm5		\n\t"\
		"vaddpd	%%ymm0,%%ymm4,%%ymm4		\n\t"\
		"vaddpd	%%ymm1,%%ymm5,%%ymm5		\n\t"\
		"vmovaps	%%ymm4,    (%%rax)	\n\t"\
		"vmovaps	%%ymm5,0x20(%%rax)	\n\t"\
		"vsubpd	%%ymm7,%%ymm2,%%ymm2		\n\t"\
		"vsubpd	%%ymm6,%%ymm3,%%ymm3		\n\t"\
		"vmovaps	%%ymm2,    (%%rcx)	\n\t"\
		"vmovaps	%%ymm3,0x20(%%rdx)	\n\t"\
		"vaddpd	%%ymm7,%%ymm7,%%ymm7		\n\t"\
		"vaddpd	%%ymm6,%%ymm6,%%ymm6		\n\t"\
		"vaddpd	%%ymm2,%%ymm7,%%ymm7		\n\t"\
		"vaddpd	%%ymm3,%%ymm6,%%ymm6		\n\t"\
		"vmovaps	%%ymm7,    (%%rdx)	\n\t"\
		"vmovaps	%%ymm6,0x20(%%rcx)	\n\t"\
	/*** Now do 4 DFTs with internal twiddles on the 1*stride - separated data. Do blocks in order 0,2,1,3 to allow increment-only of rsi-datum from 1 block to the next: ***/\
		"movq	%[__isrt2],%%rdi	\n\t"\
		"vmovaps	(%%rdi),%%ymm10	\n\t"/* isrt2 */\
		/* Block 0: r0-3 */\
		"movq	%[__out0],%%rsi	\n\t"\
		"movq	%[__in0],%%rax		\n\t"\
		"leaq	%c[__i1](%%rax),%%rcx	\n\t"/* __in0 +   istride; note BR of [a,b,c,d]-ptrs, i.e. b/c swap */\
		"leaq	%c[__i1](%%rcx),%%rbx	\n\t"/* __in0 + 2*istride */\
		"leaq	%c[__i1](%%rbx),%%rdx	\n\t"/* __in0 + 3*istride */\
		"vmovaps	    (%%rax),%%ymm0	\n\t"\
		"vmovaps	0x20(%%rax),%%ymm1	\n\t"\
		"vmovaps		(%%rbx),%%ymm2	\n\t"\
		"vmovaps	0x20(%%rbx),%%ymm3	\n\t"\
		"vsubpd		(%%rbx),%%ymm0,%%ymm0	\n\t"\
		"vsubpd	0x20(%%rbx),%%ymm1,%%ymm1	\n\t"\
		"vaddpd	    (%%rax),%%ymm2,%%ymm2	\n\t"\
		"vaddpd	0x20(%%rax),%%ymm3,%%ymm3	\n\t"\
		"vmovaps		(%%rcx),%%ymm4	\n\t"\
		"vmovaps	0x20(%%rcx),%%ymm5	\n\t"\
		"vmovaps	    (%%rdx),%%ymm6	\n\t"\
		"vmovaps	0x20(%%rdx),%%ymm7	\n\t"\
		"vsubpd		(%%rdx),%%ymm4,%%ymm4	\n\t"\
		"vsubpd	0x20(%%rdx),%%ymm5,%%ymm5	\n\t"\
		"vaddpd	    (%%rcx),%%ymm6,%%ymm6	\n\t"\
		"vaddpd	0x20(%%rcx),%%ymm7,%%ymm7	\n\t"\
		"vsubpd	%%ymm6,%%ymm2,%%ymm2		\n\t"\
		"vsubpd	%%ymm7,%%ymm3,%%ymm3		\n\t"\
		"vaddpd	%%ymm6,%%ymm6,%%ymm6		\n\t"\
		"vaddpd	%%ymm7,%%ymm7,%%ymm7		\n\t"\
		"vmovaps	%%ymm2,0x40(%%rsi)	\n\t"\
		"vmovaps	%%ymm3,0x60(%%rsi)	\n\t"\
		"vaddpd	%%ymm2,%%ymm6,%%ymm6	\n\t"\
		"vaddpd	%%ymm3,%%ymm7,%%ymm7	\n\t"\
		"vmovaps	%%ymm6,    (%%rsi)	\n\t"\
		"vmovaps	%%ymm7,0x20(%%rsi)	\n\t"\
		"vsubpd	%%ymm5,%%ymm0,%%ymm0	\n\t"\
		"vsubpd	%%ymm4,%%ymm1,%%ymm1	\n\t"\
		"vaddpd	%%ymm5,%%ymm5,%%ymm5	\n\t"\
		"vaddpd	%%ymm4,%%ymm4,%%ymm4	\n\t"\
		"vmovaps	%%ymm0,0x80(%%rsi)	\n\t"\
		"vmovaps	%%ymm1,0xe0(%%rsi)	\n\t"\
		"vaddpd	%%ymm0,%%ymm5,%%ymm5	\n\t"\
		"vaddpd	%%ymm1,%%ymm4,%%ymm4	\n\t"\
		"vmovaps	%%ymm5,0xc0(%%rsi)	\n\t"\
		"vmovaps	%%ymm4,0xa0(%%rsi)	\n\t"\
		/* Block 2: */\
		"addq	$0x200,%%rsi	\n\t"\
		"addq	$%c[__i4],%%rax	\n\t"/* All addresses += 4*ostride */\
		"addq	$%c[__i4],%%rbx	\n\t"\
		"addq	$%c[__i4],%%rcx	\n\t"\
		"addq	$%c[__i4],%%rdx	\n\t"\
		"vmovaps	    (%%rcx),%%ymm4	\n\t"\
		"vmovaps		(%%rdx),%%ymm6	\n\t"\
		"vmovaps	0x20(%%rcx),%%ymm5	\n\t"\
		"vmovaps	0x20(%%rdx),%%ymm7	\n\t"\
		"vmovaps	%%ymm4,%%ymm0		\n\t"\
		"vmovaps	%%ymm6,%%ymm2		\n\t"\
		"vmovaps	%%ymm5,%%ymm1		\n\t"\
		"vmovaps	%%ymm7,%%ymm3		\n\t"\
		"vmovaps	0x20(%%rdi),%%ymm8	\n\t"/* cc0 */\
		"vmovaps	0x40(%%rdi),%%ymm9	\n\t"/* ss0 */\
		"vmulpd	%%ymm8,%%ymm4,%%ymm4	\n\t"\
		"vmulpd	%%ymm9,%%ymm6,%%ymm6	\n\t"\
		"vmulpd	%%ymm9,%%ymm1,%%ymm1	\n\t"\
		"vmulpd	%%ymm8,%%ymm3,%%ymm3	\n\t"\
		"vmulpd	%%ymm8,%%ymm5,%%ymm5	\n\t"\
		"vmulpd	%%ymm9,%%ymm7,%%ymm7	\n\t"\
		"vmulpd	%%ymm9,%%ymm0,%%ymm0	\n\t"\
		"vmulpd	%%ymm8,%%ymm2,%%ymm2	\n\t"\
		"vsubpd	%%ymm1,%%ymm4,%%ymm4		\n\t"\
		"vsubpd	%%ymm3,%%ymm6,%%ymm6		\n\t"\
		"vaddpd	%%ymm0,%%ymm5,%%ymm5		\n\t"\
		"vaddpd	%%ymm2,%%ymm7,%%ymm7		\n\t"\
		"vsubpd	%%ymm6,%%ymm4,%%ymm4		\n\t"\
		"vsubpd	%%ymm7,%%ymm5,%%ymm5		\n\t"\
		"vaddpd	%%ymm6,%%ymm6,%%ymm6		\n\t"\
		"vaddpd	%%ymm7,%%ymm7,%%ymm7		\n\t"\
		"vaddpd	%%ymm4,%%ymm6,%%ymm6		\n\t"\
		"vaddpd	%%ymm5,%%ymm7,%%ymm7		\n\t"\
		"vmovaps	    (%%rbx),%%ymm2	\n\t"\
		"vmovaps	0x20(%%rbx),%%ymm3	\n\t"\
		"vsubpd	0x20(%%rbx),%%ymm2,%%ymm2	\n\t"\
		"vaddpd		(%%rbx),%%ymm3,%%ymm3	\n\t"\
		"vmulpd	%%ymm10,%%ymm2,%%ymm2	\n\t"/* mul by isrt2 */\
		"vmulpd	%%ymm10,%%ymm3,%%ymm3	\n\t"\
		"vmovaps	    (%%rax),%%ymm0	\n\t"\
		"vmovaps	0x20(%%rax),%%ymm1	\n\t"\
		"vsubpd	%%ymm2,%%ymm0,%%ymm0		\n\t"\
		"vsubpd	%%ymm3,%%ymm1,%%ymm1		\n\t"\
		"vaddpd	    (%%rax),%%ymm2,%%ymm2	\n\t"\
		"vaddpd	0x20(%%rax),%%ymm3,%%ymm3	\n\t"\
		"vsubpd	%%ymm6,%%ymm2,%%ymm2		\n\t"\
		"vsubpd	%%ymm7,%%ymm3,%%ymm3		\n\t"\
		"vaddpd	%%ymm6,%%ymm6,%%ymm6		\n\t"\
		"vaddpd	%%ymm7,%%ymm7,%%ymm7		\n\t"\
		"vmovaps	%%ymm2,0x40(%%rsi)	\n\t"\
		"vmovaps	%%ymm3,0x60(%%rsi)	\n\t"\
		"vaddpd	%%ymm2,%%ymm6,%%ymm6	\n\t"\
		"vaddpd	%%ymm3,%%ymm7,%%ymm7	\n\t"\
		"vmovaps	%%ymm6,    (%%rsi)	\n\t"\
		"vmovaps	%%ymm7,0x20(%%rsi)	\n\t"\
		"vsubpd	%%ymm5,%%ymm0,%%ymm0	\n\t"\
		"vsubpd	%%ymm4,%%ymm1,%%ymm1	\n\t"\
		"vaddpd	%%ymm5,%%ymm5,%%ymm5	\n\t"\
		"vaddpd	%%ymm4,%%ymm4,%%ymm4	\n\t"\
		"vmovaps	%%ymm0,0x80(%%rsi)	\n\t"\
		"vmovaps	%%ymm1,0xe0(%%rsi)	\n\t"\
		"vaddpd	%%ymm0,%%ymm5,%%ymm5	\n\t"\
		"vaddpd	%%ymm1,%%ymm4,%%ymm4	\n\t"\
		"vmovaps	%%ymm5,0xc0(%%rsi)	\n\t"\
		"vmovaps	%%ymm4,0xa0(%%rsi)	\n\t"\
		/* Block 1: r8-b */\
		"subq	$0x100,%%rsi	\n\t"\
		"addq	$%c[__i4],%%rax	\n\t"/* All addresses += 4*ostride */\
		"addq	$%c[__i4],%%rbx	\n\t"\
		"addq	$%c[__i4],%%rcx	\n\t"\
		"addq	$%c[__i4],%%rdx	\n\t"\
		"vmovaps	    (%%rax),%%ymm0	\n\t"\
		"vmovaps	0x20(%%rax),%%ymm1	\n\t"\
		"vmovaps		(%%rbx),%%ymm2	\n\t"\
		"vmovaps	0x20(%%rbx),%%ymm3	\n\t"\
		"vsubpd	0x20(%%rbx),%%ymm0,%%ymm0	\n\t"\
		"vsubpd		(%%rbx),%%ymm1,%%ymm1	\n\t"\
		"vaddpd	0x20(%%rax),%%ymm2,%%ymm2	\n\t"\
		"vaddpd		(%%rax),%%ymm3,%%ymm3	\n\t"\
		"vmovaps		(%%rcx),%%ymm4	\n\t"\
		"vmovaps	0x20(%%rcx),%%ymm5	\n\t"\
		"vmovaps	    (%%rdx),%%ymm6	\n\t"\
		"vmovaps	0x20(%%rdx),%%ymm7	\n\t"\
		"vsubpd	0x20(%%rcx),%%ymm4,%%ymm4	\n\t"\
		"vaddpd		(%%rcx),%%ymm5,%%ymm5	\n\t"\
		"vaddpd	0x20(%%rdx),%%ymm6,%%ymm6	\n\t"\
		"vsubpd		(%%rdx),%%ymm7,%%ymm7	\n\t"\
		"vmulpd	%%ymm10,%%ymm4,%%ymm4		\n\t"\
		"vmulpd	%%ymm10,%%ymm5,%%ymm5		\n\t"\
		"vmulpd	%%ymm10,%%ymm6,%%ymm6		\n\t"\
		"vmulpd	%%ymm10,%%ymm7,%%ymm7		\n\t"\
		"vsubpd	%%ymm6,%%ymm4,%%ymm4		\n\t"\
		"vsubpd	%%ymm7,%%ymm5,%%ymm5		\n\t"\
		"vaddpd	%%ymm6,%%ymm6,%%ymm6		\n\t"\
		"vaddpd	%%ymm7,%%ymm7,%%ymm7		\n\t"\
		"vaddpd	%%ymm4,%%ymm6,%%ymm6		\n\t"\
		"vaddpd	%%ymm5,%%ymm7,%%ymm7		\n\t"\
		"vsubpd	%%ymm4,%%ymm0,%%ymm0		\n\t"\
		"vsubpd	%%ymm5,%%ymm2,%%ymm2		\n\t"\
		"vaddpd	%%ymm4,%%ymm4,%%ymm4		\n\t"\
		"vaddpd	%%ymm5,%%ymm5,%%ymm5		\n\t"\
		"vmovaps	%%ymm0,0x40(%%rsi)	\n\t"\
		"vmovaps	%%ymm2,0x60(%%rsi)	\n\t"\
		"vaddpd	%%ymm0,%%ymm4,%%ymm4	\n\t"\
		"vaddpd	%%ymm2,%%ymm5,%%ymm5	\n\t"\
		"vmovaps	%%ymm4,    (%%rsi)	\n\t"\
		"vmovaps	%%ymm5,0x20(%%rsi)	\n\t"\
		"vsubpd	%%ymm7,%%ymm3,%%ymm3	\n\t"\
		"vsubpd	%%ymm6,%%ymm1,%%ymm1	\n\t"\
		"vaddpd	%%ymm7,%%ymm7,%%ymm7	\n\t"\
		"vaddpd	%%ymm6,%%ymm6,%%ymm6	\n\t"\
		"vmovaps	%%ymm3,0x80(%%rsi)	\n\t"\
		"vmovaps	%%ymm1,0xe0(%%rsi)	\n\t"\
		"vaddpd	%%ymm3,%%ymm7,%%ymm7	\n\t"\
		"vaddpd	%%ymm1,%%ymm6,%%ymm6	\n\t"\
		"vmovaps	%%ymm7,0xc0(%%rsi)	\n\t"\
		"vmovaps	%%ymm6,0xa0(%%rsi)	\n\t"\
		/* Block 3: */\
		"addq	$0x200,%%rsi	\n\t"\
		"addq	$%c[__i4],%%rax	\n\t"/* All addresses += 4*ostride */\
		"addq	$%c[__i4],%%rbx	\n\t"\
		"addq	$%c[__i4],%%rcx	\n\t"\
		"addq	$%c[__i4],%%rdx	\n\t"\
		"vmovaps	    (%%rcx),%%ymm4	\n\t"\
		"vmovaps		(%%rdx),%%ymm6	\n\t"\
		"vmovaps	0x20(%%rcx),%%ymm5	\n\t"\
		"vmovaps	0x20(%%rdx),%%ymm7	\n\t"\
		"vmovaps	%%ymm4,%%ymm0		\n\t"\
		"vmovaps	%%ymm6,%%ymm2		\n\t"\
		"vmovaps	%%ymm5,%%ymm1		\n\t"\
		"vmovaps	%%ymm7,%%ymm3		\n\t"\
		"vmulpd	%%ymm9,%%ymm4,%%ymm4	\n\t"\
		"vmulpd	%%ymm8,%%ymm6,%%ymm6	\n\t"\
		"vmulpd	%%ymm8,%%ymm1,%%ymm1	\n\t"\
		"vmulpd	%%ymm9,%%ymm3,%%ymm3	\n\t"\
		"vmulpd	%%ymm9,%%ymm5,%%ymm5	\n\t"\
		"vmulpd	%%ymm8,%%ymm7,%%ymm7	\n\t"\
		"vmulpd	%%ymm8,%%ymm0,%%ymm0	\n\t"\
		"vmulpd	%%ymm9,%%ymm2,%%ymm2	\n\t"\
		"vsubpd	%%ymm1,%%ymm4,%%ymm4		\n\t"\
		"vsubpd	%%ymm3,%%ymm6,%%ymm6		\n\t"\
		"vaddpd	%%ymm0,%%ymm5,%%ymm5		\n\t"\
		"vaddpd	%%ymm2,%%ymm7,%%ymm7		\n\t"\
		"vsubpd	%%ymm6,%%ymm4,%%ymm4		\n\t"\
		"vsubpd	%%ymm7,%%ymm5,%%ymm5		\n\t"\
		"vaddpd	%%ymm6,%%ymm6,%%ymm6		\n\t"\
		"vaddpd	%%ymm7,%%ymm7,%%ymm7		\n\t"\
		"vaddpd	%%ymm4,%%ymm6,%%ymm6		\n\t"\
		"vaddpd	%%ymm5,%%ymm7,%%ymm7		\n\t"\
		"vmovaps	    (%%rbx),%%ymm2	\n\t"\
		"vmovaps	0x20(%%rbx),%%ymm3	\n\t"\
		"vaddpd	0x20(%%rbx),%%ymm2,%%ymm2	\n\t"\
		"vsubpd	    (%%rbx),%%ymm3,%%ymm3	\n\t"\
		"vmulpd	%%ymm10,%%ymm2,%%ymm2	\n\t"/* mul by isrt2 */\
		"vmulpd	%%ymm10,%%ymm3,%%ymm3	\n\t"\
		"vmovaps	    (%%rax),%%ymm0	\n\t"\
		"vmovaps	0x20(%%rax),%%ymm1	\n\t"\
		"vsubpd	%%ymm2,%%ymm0,%%ymm0		\n\t"\
		"vsubpd	%%ymm3,%%ymm1,%%ymm1		\n\t"\
		"vaddpd	    (%%rax),%%ymm2,%%ymm2	\n\t"\
		"vaddpd	0x20(%%rax),%%ymm3,%%ymm3	\n\t"\
		"vsubpd	%%ymm4,%%ymm0,%%ymm0		\n\t"\
		"vsubpd	%%ymm5,%%ymm1,%%ymm1		\n\t"\
		"vaddpd	%%ymm4,%%ymm4,%%ymm4		\n\t"\
		"vaddpd	%%ymm5,%%ymm5,%%ymm5		\n\t"\
		"vmovaps	%%ymm0,0x40(%%rsi)	\n\t"\
		"vmovaps	%%ymm1,0x60(%%rsi)	\n\t"\
		"vaddpd	%%ymm0,%%ymm4,%%ymm4	\n\t"\
		"vaddpd	%%ymm1,%%ymm5,%%ymm5	\n\t"\
		"vmovaps	%%ymm4,    (%%rsi)	\n\t"\
		"vmovaps	%%ymm5,0x20(%%rsi)	\n\t"\
		"vsubpd	%%ymm7,%%ymm2,%%ymm2	\n\t"\
		"vsubpd	%%ymm6,%%ymm3,%%ymm3	\n\t"\
		"vaddpd	%%ymm7,%%ymm7,%%ymm7	\n\t"\
		"vaddpd	%%ymm6,%%ymm6,%%ymm6	\n\t"\
		"vmovaps	%%ymm2,0x80(%%rsi)	\n\t"\
		"vmovaps	%%ymm3,0xe0(%%rsi)	\n\t"\
		"vaddpd	%%ymm2,%%ymm7,%%ymm7	\n\t"\
		"vaddpd	%%ymm3,%%ymm6,%%ymm6	\n\t"\
		"vmovaps	%%ymm7,0xc0(%%rsi)	\n\t"\
		"vmovaps	%%ymm6,0xa0(%%rsi)	\n\t"\
		:					/* outputs: none */\
		:[__in0] "m" (Xin0)	/* All inputs from memory addresses here */\
		,[__i1] "e" (Xi1)\
		,[__i2] "e" (Xi2)\
		,[__i3] "e" (Xi3)\
		,[__i4] "e" (Xi4)\
		,[__isrt2] "m" (Xisrt2)\
		,[__two] "m" (Xtwo)\
		,[__out0] "m" (Xout0)\
		: "cc","memory","rax","rbx","rcx","rdx","rdi","rsi","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10"		/* Clobbered registers */\
	);\
	}

   #endif	// 8/16-reg versions of SSE2_RADIX16_DIF_0TWIDDLE_B

	// Based on the SSE2_RADIX16_DIT_NOTWIDDLE macro in radix16_ditN_cy_dif1_gcc64.h, but with completely
	// specifiable 16-input addressing required for usage as the power-of-2 component of a twiddleless
	// radix = [odd*2^n] DFT routine.
	// We use just a single output base-pointer plus literal ostrides which are [1,2,3,4]-multiples of
	// __o1; this allows us to cut GP-register usage, which is absolutely a must for the 32-bit version\
	// of the macro, and is a benefit to the 64-bit versions which code-fold to yield 2 side-by-side
	// streams of independently executable instructions, one for data in xmm0-7, the other using xmm8-15.
	#define SSE2_RADIX16_DIT_0TWIDDLE(Xin0,Xoff, Xisrt2,Xtwo, Xout0,Xo1,Xo2,Xo3,Xo4)\
	{\
	__asm__ volatile (\
	"movq	%[in0],%%r8	\n\t	movq	%[off],%%r9	\n\t"/* Load input base-address into r8 and int32[16] offset-array pointer into r9 */\
		"movslq		    (%%r9),%%rax	\n\t"/* off[0-3] */\
		"movslq		0x04(%%r9),%%rbx	\n\t"\
		"movslq		0x08(%%r9),%%rcx	\n\t"\
		"movslq		0x0c(%%r9),%%rdx	\n\t"\
		"leaq	(%%r8,%%rax,8),%%rax	\n\t"/* in0 + off[0-3] */\
		"leaq	(%%r8,%%rbx,8),%%rbx	\n\t"\
		"leaq	(%%r8,%%rcx,8),%%rcx	\n\t"\
		"leaq	(%%r8,%%rdx,8),%%rdx	\n\t"\
	"prefetcht1	0x100(%%rax)\n\t"\
	/* SSE2_RADIX4_DIT_0TWIDDLE_B(r0 ): */\
		"vmovaps	    (%%rax),%%ymm2	\n\t"\
		"vmovaps	    (%%rcx),%%ymm6		\n\t"\
		"vmovaps	0x20(%%rax),%%ymm3	\n\t"\
		"vmovaps	0x20(%%rcx),%%ymm7		\n\t"\
		"vmovaps	    (%%rbx),%%ymm0	\n\t"\
		"vmovaps	    (%%rdx),%%ymm4	\n\t"\
		"vmovaps	0x20(%%rbx),%%ymm1	\n\t"\
		"vmovaps	0x20(%%rdx),%%ymm5	\n\t"\
		"vsubpd	%%ymm0,%%ymm2,%%ymm2		\n\t"\
		"vsubpd	%%ymm4,%%ymm6,%%ymm6		\n\t"\
		"vsubpd	%%ymm1,%%ymm3,%%ymm3		\n\t"\
		"vsubpd	%%ymm5,%%ymm7,%%ymm7		\n\t"\
		"vaddpd	%%ymm0,%%ymm0,%%ymm0		\n\t"\
		"vaddpd	%%ymm4,%%ymm4,%%ymm4		\n\t"\
		"vaddpd	%%ymm1,%%ymm1,%%ymm1		\n\t"\
		"vaddpd	%%ymm5,%%ymm5,%%ymm5		\n\t"\
		"vaddpd	%%ymm2,%%ymm0,%%ymm0		\n\t"\
		"vaddpd	%%ymm6,%%ymm4,%%ymm4		\n\t"\
		"vaddpd	%%ymm3,%%ymm1,%%ymm1		\n\t"\
		"vaddpd	%%ymm7,%%ymm5,%%ymm5		\n\t"\
		"movq	%[__out0],%%rsi		\n\t"\
		"vsubpd	%%ymm4,%%ymm0,%%ymm0		\n\t"\
		"vsubpd	%%ymm7,%%ymm2,%%ymm2		\n\t"\
		"vsubpd	%%ymm5,%%ymm1,%%ymm1		\n\t"\
		"vsubpd	%%ymm6,%%ymm3,%%ymm3		\n\t"\
	"prefetcht1	0x100(%%rcx)\n\t"\
		"leaq	0x20(%%rsi),%%rdi	\n\t"/* Need separate address Im parts of outputs due to literal-offsets below */\
		"vmovaps	%%ymm0,%c[__o2](%%rsi)	\n\t"\
		"vmovaps	%%ymm2,%c[__o3](%%rsi)	\n\t"\
		"vmovaps	%%ymm1,%c[__o2](%%rdi)	\n\t"\
		"vmovaps	%%ymm3,%c[__o1](%%rdi)	\n\t"\
		"vaddpd	%%ymm4,%%ymm4,%%ymm4		\n\t"\
		"vaddpd	%%ymm7,%%ymm7,%%ymm7		\n\t"\
		"vaddpd	%%ymm5,%%ymm5,%%ymm5		\n\t"\
		"vaddpd	%%ymm6,%%ymm6,%%ymm6		\n\t"\
		"vaddpd	%%ymm0,%%ymm4,%%ymm4		\n\t"\
		"vaddpd	%%ymm2,%%ymm7,%%ymm7		\n\t"\
		"vaddpd	%%ymm1,%%ymm5,%%ymm5		\n\t"\
		"vaddpd	%%ymm3,%%ymm6,%%ymm6		\n\t"\
		"vmovaps	%%ymm4,        (%%rsi)	\n\t"\
		"vmovaps	%%ymm7,%c[__o1](%%rsi)	\n\t"\
		"vmovaps	%%ymm5,        (%%rdi)	\n\t"\
		"vmovaps	%%ymm6,%c[__o3](%%rdi)	\n\t"\
	"prefetcht1	0x100(%%rax)\n\t"\
	/* SSE2_RADIX4_DIT_0TWIDDLE_B(r8 ): */\
		"movslq		0x10(%%r9),%%rax	\n\t"/* off[4-7] */\
		"movslq		0x14(%%r9),%%rbx	\n\t"\
		"movslq		0x18(%%r9),%%rcx	\n\t"\
		"movslq		0x1c(%%r9),%%rdx	\n\t"\
		"leaq	(%%r8,%%rax,8),%%rax	\n\t"/* in0 + off[4-7] */\
		"leaq	(%%r8,%%rbx,8),%%rbx	\n\t"\
		"leaq	(%%r8,%%rcx,8),%%rcx	\n\t"\
		"leaq	(%%r8,%%rdx,8),%%rdx	\n\t"\
		"leaq	%c[__o4](%%rsi),%%rsi		\n\t"/* __out0 + 4*ostride */\
		"vmovaps	    (%%rax),%%ymm2	\n\t"\
		"vmovaps	    (%%rcx),%%ymm6	\n\t"\
		"vmovaps	0x20(%%rax),%%ymm3	\n\t"\
		"vmovaps	0x20(%%rcx),%%ymm7	\n\t"\
		"vmovaps	    (%%rbx),%%ymm0	\n\t"\
		"vmovaps	    (%%rdx),%%ymm4	\n\t"\
		"vmovaps	0x20(%%rbx),%%ymm1	\n\t"\
		"vmovaps	0x20(%%rdx),%%ymm5	\n\t"\
		"vsubpd	%%ymm0,%%ymm2,%%ymm2		\n\t"\
		"vsubpd	%%ymm4,%%ymm6,%%ymm6		\n\t"\
		"vsubpd	%%ymm1,%%ymm3,%%ymm3		\n\t"\
		"vsubpd	%%ymm5,%%ymm7,%%ymm7		\n\t"\
		"vaddpd	%%ymm0,%%ymm0,%%ymm0		\n\t"\
		"vaddpd	%%ymm4,%%ymm4,%%ymm4		\n\t"\
		"vaddpd	%%ymm1,%%ymm1,%%ymm1		\n\t"\
		"vaddpd	%%ymm5,%%ymm5,%%ymm5		\n\t"\
		"vaddpd	%%ymm2,%%ymm0,%%ymm0		\n\t"\
		"vaddpd	%%ymm6,%%ymm4,%%ymm4		\n\t"\
		"vaddpd	%%ymm3,%%ymm1,%%ymm1		\n\t"\
		"vaddpd	%%ymm7,%%ymm5,%%ymm5		\n\t"\
		"vsubpd	%%ymm4,%%ymm0,%%ymm0		\n\t"\
		"vsubpd	%%ymm7,%%ymm2,%%ymm2		\n\t"\
		"vsubpd	%%ymm5,%%ymm1,%%ymm1		\n\t"\
		"vsubpd	%%ymm6,%%ymm3,%%ymm3		\n\t"\
	"prefetcht1	0x100(%%rcx)\n\t"\
		"leaq	0x20(%%rsi),%%rdi	\n\t"\
		"vmovaps	%%ymm0,%c[__o2](%%rsi)	\n\t"\
		"vmovaps	%%ymm2,%c[__o3](%%rsi)	\n\t"\
		"vmovaps	%%ymm1,%c[__o2](%%rdi)	\n\t"\
		"vmovaps	%%ymm3,%c[__o1](%%rdi)	\n\t"\
		"vaddpd	%%ymm4,%%ymm4,%%ymm4		\n\t"\
		"vaddpd	%%ymm7,%%ymm7,%%ymm7		\n\t"\
		"vaddpd	%%ymm5,%%ymm5,%%ymm5		\n\t"\
		"vaddpd	%%ymm6,%%ymm6,%%ymm6		\n\t"\
		"vaddpd	%%ymm0,%%ymm4,%%ymm4		\n\t"\
		"vaddpd	%%ymm2,%%ymm7,%%ymm7		\n\t"\
		"vaddpd	%%ymm1,%%ymm5,%%ymm5		\n\t"\
		"vaddpd	%%ymm3,%%ymm6,%%ymm6		\n\t"\
		"vmovaps	%%ymm4,        (%%rsi)	\n\t"\
		"vmovaps	%%ymm7,%c[__o1](%%rsi)	\n\t"\
		"vmovaps	%%ymm5,        (%%rdi)	\n\t"\
		"vmovaps	%%ymm6,%c[__o3](%%rdi)	\n\t"\
	"prefetcht1	0x100(%%rax)\n\t"\
	/* SSE2_RADIX4_DIT_0TWIDDLE_B(r16): */\
		"movslq		0x20(%%r9),%%rax	\n\t"/* off[8-b] */\
		"movslq		0x24(%%r9),%%rbx	\n\t"\
		"movslq		0x28(%%r9),%%rcx	\n\t"\
		"movslq		0x2c(%%r9),%%rdx	\n\t"\
		"leaq	(%%r8,%%rax,8),%%rax	\n\t"/* in0 + off[8-b] */\
		"leaq	(%%r8,%%rbx,8),%%rbx	\n\t"\
		"leaq	(%%r8,%%rcx,8),%%rcx	\n\t"\
		"leaq	(%%r8,%%rdx,8),%%rdx	\n\t"\
		"leaq	%c[__o4](%%rsi),%%rsi		\n\t"/* __out0 + 8*ostride */\
		"vmovaps	    (%%rax),%%ymm2	\n\t"\
		"vmovaps	    (%%rcx),%%ymm6	\n\t"\
		"vmovaps	0x20(%%rax),%%ymm3	\n\t"\
		"vmovaps	0x20(%%rcx),%%ymm7	\n\t"\
		"vmovaps	    (%%rbx),%%ymm0	\n\t"\
		"vmovaps	    (%%rdx),%%ymm4	\n\t"\
		"vmovaps	0x20(%%rbx),%%ymm1	\n\t"\
		"vmovaps	0x20(%%rdx),%%ymm5	\n\t"\
		"vsubpd	%%ymm0,%%ymm2,%%ymm2		\n\t"\
		"vsubpd	%%ymm4,%%ymm6,%%ymm6		\n\t"\
		"vsubpd	%%ymm1,%%ymm3,%%ymm3		\n\t"\
		"vsubpd	%%ymm5,%%ymm7,%%ymm7		\n\t"\
		"vaddpd	%%ymm0,%%ymm0,%%ymm0		\n\t"\
		"vaddpd	%%ymm4,%%ymm4,%%ymm4		\n\t"\
		"vaddpd	%%ymm1,%%ymm1,%%ymm1		\n\t"\
		"vaddpd	%%ymm5,%%ymm5,%%ymm5		\n\t"\
		"vaddpd	%%ymm2,%%ymm0,%%ymm0		\n\t"\
		"vaddpd	%%ymm6,%%ymm4,%%ymm4		\n\t"\
		"vaddpd	%%ymm3,%%ymm1,%%ymm1		\n\t"\
		"vaddpd	%%ymm7,%%ymm5,%%ymm5		\n\t"\
		"vsubpd	%%ymm4,%%ymm0,%%ymm0			\n\t"\
		"vsubpd	%%ymm7,%%ymm2,%%ymm2			\n\t"\
		"vsubpd	%%ymm5,%%ymm1,%%ymm1			\n\t"\
		"vsubpd	%%ymm6,%%ymm3,%%ymm3			\n\t"\
	"prefetcht1	0x100(%%rcx)\n\t"\
		"leaq	0x20(%%rsi),%%rdi	\n\t"\
		"vmovaps	%%ymm0,%c[__o2](%%rsi)	\n\t"\
		"vmovaps	%%ymm2,%c[__o3](%%rsi)	\n\t"\
		"vmovaps	%%ymm1,%c[__o2](%%rdi)	\n\t"\
		"vmovaps	%%ymm3,%c[__o1](%%rdi)	\n\t"\
		"vaddpd	%%ymm4,%%ymm4,%%ymm4			\n\t"\
		"vaddpd	%%ymm7,%%ymm7,%%ymm7			\n\t"\
		"vaddpd	%%ymm5,%%ymm5,%%ymm5			\n\t"\
		"vaddpd	%%ymm6,%%ymm6,%%ymm6			\n\t"\
		"vaddpd	%%ymm0,%%ymm4,%%ymm4			\n\t"\
		"vaddpd	%%ymm2,%%ymm7,%%ymm7			\n\t"\
		"vaddpd	%%ymm1,%%ymm5,%%ymm5			\n\t"\
		"vaddpd	%%ymm3,%%ymm6,%%ymm6			\n\t"\
		"vmovaps	%%ymm4,        (%%rsi)	\n\t"\
		"vmovaps	%%ymm7,%c[__o1](%%rsi)	\n\t"\
		"vmovaps	%%ymm5,        (%%rdi)	\n\t"\
		"vmovaps	%%ymm6,%c[__o3](%%rdi)	\n\t"\
	"prefetcht1	0x100(%%rax)\n\t"\
	/* SSE2_RADIX4_DIT_0TWIDDLE_B(r24): */\
		"movslq		0x30(%%r9),%%rax	\n\t"/* off[c-f] */\
		"movslq		0x34(%%r9),%%rbx	\n\t"\
		"movslq		0x38(%%r9),%%rcx	\n\t"\
		"movslq		0x3c(%%r9),%%rdx	\n\t"\
		"leaq	(%%r8,%%rax,8),%%rax	\n\t"/* in0 + off[c-f] */\
		"leaq	(%%r8,%%rbx,8),%%rbx	\n\t"\
		"leaq	(%%r8,%%rcx,8),%%rcx	\n\t"\
		"leaq	(%%r8,%%rdx,8),%%rdx	\n\t"\
		"leaq	%c[__o4](%%rsi),%%rsi		\n\t"/* __out0 + c*ostride */\
		"vmovaps	    (%%rax),%%ymm2	\n\t"\
		"vmovaps	    (%%rcx),%%ymm6	\n\t"\
		"vmovaps	0x20(%%rax),%%ymm3	\n\t"\
		"vmovaps	0x20(%%rcx),%%ymm7	\n\t"\
		"vmovaps	    (%%rbx),%%ymm0	\n\t"\
		"vmovaps	    (%%rdx),%%ymm4	\n\t"\
		"vmovaps	0x20(%%rbx),%%ymm1	\n\t"\
		"vmovaps	0x20(%%rdx),%%ymm5	\n\t"\
		"vsubpd	%%ymm0,%%ymm2,%%ymm2		\n\t"\
		"vsubpd	%%ymm4,%%ymm6,%%ymm6		\n\t"\
		"vsubpd	%%ymm1,%%ymm3,%%ymm3		\n\t"\
		"vsubpd	%%ymm5,%%ymm7,%%ymm7		\n\t"\
		"vaddpd	%%ymm0,%%ymm0,%%ymm0		\n\t"\
		"vaddpd	%%ymm4,%%ymm4,%%ymm4		\n\t"\
		"vaddpd	%%ymm1,%%ymm1,%%ymm1		\n\t"\
		"vaddpd	%%ymm5,%%ymm5,%%ymm5		\n\t"\
		"vaddpd	%%ymm2,%%ymm0,%%ymm0		\n\t"\
		"vaddpd	%%ymm6,%%ymm4,%%ymm4		\n\t"\
		"vaddpd	%%ymm3,%%ymm1,%%ymm1		\n\t"\
		"vaddpd	%%ymm7,%%ymm5,%%ymm5		\n\t"\
		"vsubpd	%%ymm4,%%ymm0,%%ymm0		\n\t"\
		"vsubpd	%%ymm7,%%ymm2,%%ymm2		\n\t"\
		"vsubpd	%%ymm5,%%ymm1,%%ymm1		\n\t"\
		"vsubpd	%%ymm6,%%ymm3,%%ymm3		\n\t"\
	"prefetcht1	0x100(%%rcx)\n\t"\
		"leaq	0x20(%%rsi),%%rdi	\n\t"\
		"vmovaps	%%ymm0,%c[__o2](%%rsi)	\n\t"\
		"vmovaps	%%ymm2,%c[__o3](%%rsi)	\n\t"\
		"vmovaps	%%ymm1,%c[__o2](%%rdi)	\n\t"\
		"vmovaps	%%ymm3,%c[__o1](%%rdi)	\n\t"\
		"vaddpd	%%ymm4,%%ymm4,%%ymm4		\n\t"\
		"vaddpd	%%ymm7,%%ymm7,%%ymm7		\n\t"\
		"vaddpd	%%ymm5,%%ymm5,%%ymm5		\n\t"\
		"vaddpd	%%ymm6,%%ymm6,%%ymm6		\n\t"\
		"vaddpd	%%ymm0,%%ymm4,%%ymm4		\n\t"\
		"vaddpd	%%ymm2,%%ymm7,%%ymm7		\n\t"\
		"vaddpd	%%ymm1,%%ymm5,%%ymm5		\n\t"\
		"vaddpd	%%ymm3,%%ymm6,%%ymm6		\n\t"\
		"vmovaps	%%ymm4,        (%%rsi)	\n\t"\
		"vmovaps	%%ymm7,%c[__o1](%%rsi)	\n\t"\
		"vmovaps	%%ymm5,        (%%rdi)	\n\t"\
		"vmovaps	%%ymm6,%c[__o3](%%rdi)	\n\t"\
	/*** Now do 4 DFTs with internal twiddles on the 4*stride - separated data: ***/\
		"movq	%[__out0],%%rax		\n\t"\
		"leaq	%c[__o4](%%rax),%%rbx	\n\t"/* __out0 +   [4*ostride] */\
		"leaq	%c[__o4](%%rbx),%%rcx	\n\t"/* __out0 + 2*[4*ostride] */\
		"leaq	%c[__o4](%%rcx),%%rdx	\n\t"/* __out0 + 3*[4*ostride] */\
		/* Block 0: */\
		"vmovaps	    (%%rax),%%ymm0	\n\t"\
		"vmovaps	0x20(%%rax),%%ymm1	\n\t"\
		"vmovaps	    (%%rbx),%%ymm2	\n\t"\
		"vmovaps	0x20(%%rbx),%%ymm3	\n\t"\
		"vsubpd	    (%%rbx),%%ymm0,%%ymm0	\n\t"\
		"vsubpd	0x20(%%rbx),%%ymm1,%%ymm1	\n\t"\
		"vaddpd	    (%%rax),%%ymm2,%%ymm2	\n\t"\
		"vaddpd	0x20(%%rax),%%ymm3,%%ymm3	\n\t"\
		"vmovaps	    (%%rcx),%%ymm4	\n\t"\
		"vmovaps	0x20(%%rcx),%%ymm5	\n\t"\
		"vmovaps	    (%%rdx),%%ymm6	\n\t"\
		"vmovaps	0x20(%%rdx),%%ymm7	\n\t"\
		"vsubpd	    (%%rdx),%%ymm4,%%ymm4	\n\t"\
		"vsubpd	0x20(%%rdx),%%ymm5,%%ymm5	\n\t"\
		"vaddpd	    (%%rcx),%%ymm6,%%ymm6	\n\t"\
		"vaddpd	0x20(%%rcx),%%ymm7,%%ymm7	\n\t"\
		"vsubpd	%%ymm6,%%ymm2,%%ymm2		\n\t"\
		"vsubpd	%%ymm7,%%ymm3,%%ymm3		\n\t"\
		"vmovaps	%%ymm2,    (%%rcx)	\n\t"\
		"vmovaps	%%ymm3,0x20(%%rcx)	\n\t"\
		"vaddpd	%%ymm6,%%ymm6,%%ymm6		\n\t"\
		"vaddpd	%%ymm7,%%ymm7,%%ymm7		\n\t"\
		"vaddpd	%%ymm2,%%ymm6,%%ymm6		\n\t"\
		"vaddpd	%%ymm3,%%ymm7,%%ymm7		\n\t"\
		"vmovaps	%%ymm6,    (%%rax)	\n\t"\
		"vmovaps	%%ymm7,0x20(%%rax)	\n\t"\
		"vsubpd	%%ymm5,%%ymm0,%%ymm0		\n\t"\
		"vsubpd	%%ymm4,%%ymm1,%%ymm1		\n\t"\
		"vmovaps	%%ymm0,    (%%rdx)	\n\t"\
		"vmovaps	%%ymm1,0x20(%%rbx)	\n\t"\
		"vaddpd	%%ymm5,%%ymm5,%%ymm5		\n\t"\
		"vaddpd	%%ymm4,%%ymm4,%%ymm4		\n\t"\
		"vaddpd	%%ymm0,%%ymm5,%%ymm5		\n\t"\
		"vaddpd	%%ymm1,%%ymm4,%%ymm4		\n\t"\
		"vmovaps	%%ymm5,    (%%rbx)	\n\t"\
		"vmovaps	%%ymm4,0x20(%%rdx)	\n\t"\
		/* Block 2: */\
		"addq	$%c[__o2],%%rax		\n\t"/* All addresses += 2*ostride */\
		"addq	$%c[__o2],%%rbx		\n\t"\
		"addq	$%c[__o2],%%rcx		\n\t"\
		"addq	$%c[__o2],%%rdx		\n\t"\
		"movq	%[__isrt2],%%rdi	\n\t"\
		"vmovaps	(%%rdi),%%ymm2		\n\t"\
		"vmovaps	    (%%rcx),%%ymm4	\n\t"\
		"vmovaps	0x20(%%rcx),%%ymm5	\n\t"\
		"vmovaps	    (%%rdx),%%ymm0	\n\t"\
		"vmovaps	0x20(%%rdx),%%ymm1	\n\t"\
		"vaddpd	0x20(%%rcx),%%ymm4,%%ymm4	\n\t"\
		"vsubpd	    (%%rcx),%%ymm5,%%ymm5	\n\t"\
		"vsubpd	0x20(%%rdx),%%ymm0,%%ymm0	\n\t"\
		"vaddpd	    (%%rdx),%%ymm1,%%ymm1	\n\t"\
		"vmulpd	%%ymm2,%%ymm4,%%ymm4					\n\t"\
		"vmulpd	%%ymm2,%%ymm5,%%ymm5					\n\t"\
		"vmulpd	%%ymm2,%%ymm0,%%ymm0					\n\t"\
		"vmulpd	%%ymm2,%%ymm1,%%ymm1					\n\t"\
		"vmovaps	%%ymm4,%%ymm6		\n\t"\
		"vmovaps	%%ymm5,%%ymm7		\n\t"\
		"vsubpd	%%ymm0,%%ymm4,%%ymm4					\n\t"\
		"vsubpd	%%ymm1,%%ymm5,%%ymm5					\n\t"\
		"vaddpd	%%ymm0,%%ymm6,%%ymm6					\n\t"\
		"vaddpd	%%ymm1,%%ymm7,%%ymm7					\n\t"\
		"vmovaps	    (%%rax),%%ymm0	\n\t"\
		"vmovaps	0x20(%%rax),%%ymm1	\n\t"\
		"vmovaps	    (%%rbx),%%ymm2	\n\t"\
		"vmovaps	0x20(%%rbx),%%ymm3	\n\t"\
		"vsubpd	0x20(%%rbx),%%ymm0,%%ymm0	\n\t"\
		"vsubpd	    (%%rbx),%%ymm1,%%ymm1	\n\t"\
		"vaddpd	    (%%rax),%%ymm3,%%ymm3	\n\t"\
		"vaddpd	0x20(%%rax),%%ymm2,%%ymm2	\n\t"\
		"vsubpd	%%ymm4,%%ymm3,%%ymm3		\n\t"\
		"vsubpd	%%ymm5,%%ymm1,%%ymm1		\n\t"\
		"vmovaps	%%ymm3,    (%%rcx)	\n\t"\
		"vmovaps	%%ymm1,0x20(%%rcx)	\n\t"\
		"vaddpd	%%ymm4,%%ymm4,%%ymm4		\n\t"\
		"vaddpd	%%ymm5,%%ymm5,%%ymm5		\n\t"\
		"vaddpd	%%ymm3,%%ymm4,%%ymm4		\n\t"\
		"vaddpd	%%ymm1,%%ymm5,%%ymm5		\n\t"\
		"vmovaps	%%ymm4,    (%%rax)	\n\t"\
		"vmovaps	%%ymm5,0x20(%%rax)	\n\t"\
		"vsubpd	%%ymm7,%%ymm0,%%ymm0		\n\t"\
		"vsubpd	%%ymm6,%%ymm2,%%ymm2		\n\t"\
		"vmovaps	%%ymm0,    (%%rdx)	\n\t"\
		"vmovaps	%%ymm2,0x20(%%rbx)	\n\t"\
		"vaddpd	%%ymm7,%%ymm7,%%ymm7		\n\t"\
		"vaddpd	%%ymm6,%%ymm6,%%ymm6		\n\t"\
		"vaddpd	%%ymm0,%%ymm7,%%ymm7		\n\t"\
		"vaddpd	%%ymm2,%%ymm6,%%ymm6		\n\t"\
		"vmovaps	%%ymm7,    (%%rbx)	\n\t"\
		"vmovaps	%%ymm6,0x20(%%rdx)	\n\t"\
		/* Block 1: */\
		"subq	$%c[__o1],%%rax		\n\t"/* All subresses += 1*ostride */\
		"subq	$%c[__o1],%%rbx		\n\t"\
		"subq	$%c[__o1],%%rcx		\n\t"\
		"subq	$%c[__o1],%%rdx		\n\t"\
		"leaq	0x20(%%rdi),%%rsi	\n\t"/* cc0 */\
		"vmovaps	    (%%rdx),%%ymm0	\n\t"\
		"vmovaps	0x20(%%rdx),%%ymm1	\n\t"\
		"vmovaps	    (%%rdx),%%ymm2	\n\t"\
		"vmovaps	0x20(%%rdx),%%ymm3	\n\t"\
		"vmulpd	0x20(%%rsi),%%ymm0,%%ymm0	\n\t"\
		"vmulpd	0x20(%%rsi),%%ymm1,%%ymm1	\n\t"\
		"vmulpd	    (%%rsi),%%ymm2,%%ymm2	\n\t"\
		"vmulpd	    (%%rsi),%%ymm3,%%ymm3	\n\t"\
		"vsubpd	%%ymm2,%%ymm1,%%ymm1		\n\t"\
		"vaddpd	%%ymm3,%%ymm0,%%ymm0		\n\t"\
		"vmovaps	    (%%rcx),%%ymm4	\n\t"\
		"vmovaps	0x20(%%rcx),%%ymm5	\n\t"\
		"vmovaps	    (%%rcx),%%ymm6	\n\t"\
		"vmovaps	0x20(%%rcx),%%ymm7	\n\t"\
		"vmulpd	    (%%rsi),%%ymm4,%%ymm4	\n\t"\
		"vmulpd	    (%%rsi),%%ymm5,%%ymm5	\n\t"\
		"vmulpd	0x20(%%rsi),%%ymm6,%%ymm6	\n\t"\
		"vmulpd	0x20(%%rsi),%%ymm7,%%ymm7	\n\t"\
		"vsubpd	%%ymm6,%%ymm5,%%ymm5		\n\t"\
		"vaddpd	%%ymm7,%%ymm4,%%ymm4		\n\t"\
		"vmovaps	%%ymm5,%%ymm7		\n\t"\
		"vmovaps	%%ymm4,%%ymm6		\n\t"\
		"vaddpd	%%ymm0,%%ymm4,%%ymm4		\n\t"\
		"vaddpd	%%ymm1,%%ymm5,%%ymm5		\n\t"\
		"vsubpd	%%ymm0,%%ymm6,%%ymm6		\n\t"\
		"vsubpd	%%ymm1,%%ymm7,%%ymm7		\n\t"\
		"vmovaps	    (%%rbx),%%ymm2	\n\t"\
		"vmovaps	0x20(%%rbx),%%ymm3	\n\t"\
		"vmovaps	    (%%rax),%%ymm0	\n\t"\
		"vmovaps	0x20(%%rax),%%ymm1	\n\t"\
		"vaddpd	0x20(%%rbx),%%ymm2,%%ymm2	\n\t"\
		"vsubpd	    (%%rbx),%%ymm3,%%ymm3	\n\t"\
		"vmulpd	    (%%rdi),%%ymm2,%%ymm2	\n\t"\
		"vmulpd	    (%%rdi),%%ymm3,%%ymm3	\n\t"\
		"vsubpd	%%ymm2,%%ymm0,%%ymm0		\n\t"\
		"vsubpd	%%ymm3,%%ymm1,%%ymm1		\n\t"\
		"vaddpd	%%ymm2,%%ymm2,%%ymm2		\n\t"\
		"vaddpd	%%ymm3,%%ymm3,%%ymm3		\n\t"\
		"vaddpd	%%ymm0,%%ymm2,%%ymm2		\n\t"\
		"vaddpd	%%ymm1,%%ymm3,%%ymm3		\n\t"\
		"vsubpd	%%ymm4,%%ymm2,%%ymm2		\n\t"\
		"vsubpd	%%ymm5,%%ymm3,%%ymm3		\n\t"\
		"vmovaps	%%ymm2,    (%%rcx)	\n\t"\
		"vmovaps	%%ymm3,0x20(%%rcx)	\n\t"\
		"vaddpd	%%ymm4,%%ymm4,%%ymm4		\n\t"\
		"vaddpd	%%ymm5,%%ymm5,%%ymm5		\n\t"\
		"vaddpd	%%ymm2,%%ymm4,%%ymm4		\n\t"\
		"vaddpd	%%ymm3,%%ymm5,%%ymm5		\n\t"\
		"vmovaps	%%ymm4,    (%%rax)	\n\t"\
		"vmovaps	%%ymm5,0x20(%%rax)	\n\t"\
		"vsubpd	%%ymm7,%%ymm0,%%ymm0		\n\t"\
		"vsubpd	%%ymm6,%%ymm1,%%ymm1		\n\t"\
		"vmovaps	%%ymm0,    (%%rdx)	\n\t"\
		"vmovaps	%%ymm1,0x20(%%rbx)	\n\t"\
		"vaddpd	%%ymm7,%%ymm7,%%ymm7		\n\t"\
		"vaddpd	%%ymm6,%%ymm6,%%ymm6		\n\t"\
		"vaddpd	%%ymm0,%%ymm7,%%ymm7		\n\t"\
		"vaddpd	%%ymm1,%%ymm6,%%ymm6		\n\t"\
		"vmovaps	%%ymm7,    (%%rbx)	\n\t"\
		"vmovaps	%%ymm6,0x20(%%rdx)	\n\t"\
		/* Block 3: */\
		"addq	$%c[__o2],%%rax		\n\t"/* All addresses += 2*ostride */\
		"addq	$%c[__o2],%%rbx		\n\t"\
		"addq	$%c[__o2],%%rcx		\n\t"\
		"addq	$%c[__o2],%%rdx		\n\t"\
		"vmovaps	    (%%rdx),%%ymm0	\n\t"\
		"vmovaps	0x20(%%rdx),%%ymm1	\n\t"\
		"vmovaps	    (%%rdx),%%ymm2	\n\t"\
		"vmovaps	0x20(%%rdx),%%ymm3	\n\t"\
		"vmulpd	    (%%rsi),%%ymm0,%%ymm0	\n\t"\
		"vmulpd	    (%%rsi),%%ymm1,%%ymm1	\n\t"\
		"vmulpd	0x20(%%rsi),%%ymm2,%%ymm2	\n\t"\
		"vmulpd	0x20(%%rsi),%%ymm3,%%ymm3	\n\t"\
		"vsubpd	%%ymm2,%%ymm1,%%ymm1		\n\t"\
		"vaddpd	%%ymm3,%%ymm0,%%ymm0		\n\t"\
		"vmovaps	    (%%rcx),%%ymm4	\n\t"\
		"vmovaps	0x20(%%rcx),%%ymm5	\n\t"\
		"vmovaps	    (%%rcx),%%ymm6	\n\t"\
		"vmovaps	0x20(%%rcx),%%ymm7	\n\t"\
		"vmulpd	0x20(%%rsi),%%ymm4,%%ymm4	\n\t"\
		"vmulpd	0x20(%%rsi),%%ymm5,%%ymm5	\n\t"\
		"vmulpd	    (%%rsi),%%ymm6,%%ymm6	\n\t"\
		"vmulpd	    (%%rsi),%%ymm7,%%ymm7	\n\t"\
		"vsubpd	%%ymm6,%%ymm5,%%ymm5		\n\t"\
		"vaddpd	%%ymm7,%%ymm4,%%ymm4		\n\t"\
		"vmovaps	%%ymm5,%%ymm7		\n\t"\
		"vmovaps	%%ymm4,%%ymm6		\n\t"\
		"vaddpd	%%ymm0,%%ymm4,%%ymm4		\n\t"\
		"vaddpd	%%ymm1,%%ymm5,%%ymm5		\n\t"\
		"vsubpd	%%ymm0,%%ymm6,%%ymm6		\n\t"\
		"vsubpd	%%ymm1,%%ymm7,%%ymm7		\n\t"\
		"vmovaps	    (%%rbx),%%ymm2	\n\t"\
		"vmovaps	0x20(%%rbx),%%ymm3	\n\t"\
		"vmovaps	    (%%rax),%%ymm0	\n\t"\
		"vmovaps	0x20(%%rax),%%ymm1	\n\t"\
		"vsubpd	0x20(%%rbx),%%ymm2,%%ymm2	\n\t"\
		"vaddpd	    (%%rbx),%%ymm3,%%ymm3	\n\t"\
		"vmulpd	    (%%rdi),%%ymm2,%%ymm2	\n\t"\
		"vmulpd	    (%%rdi),%%ymm3,%%ymm3	\n\t"\
		"vsubpd	%%ymm2,%%ymm0,%%ymm0		\n\t"\
		"vsubpd	%%ymm3,%%ymm1,%%ymm1		\n\t"\
		"vaddpd	%%ymm2,%%ymm2,%%ymm2		\n\t"\
		"vaddpd	%%ymm3,%%ymm3,%%ymm3		\n\t"\
		"vaddpd	%%ymm0,%%ymm2,%%ymm2		\n\t"\
		"vaddpd	%%ymm1,%%ymm3,%%ymm3		\n\t"\
		"vsubpd	%%ymm6,%%ymm0,%%ymm0		\n\t"\
		"vsubpd	%%ymm7,%%ymm1,%%ymm1		\n\t"\
		"vmovaps	%%ymm0,    (%%rcx)	\n\t"\
		"vmovaps	%%ymm1,0x20(%%rcx)	\n\t"\
		"vaddpd	%%ymm6,%%ymm6,%%ymm6		\n\t"\
		"vaddpd	%%ymm7,%%ymm7,%%ymm7		\n\t"\
		"vaddpd	%%ymm0,%%ymm6,%%ymm6		\n\t"\
		"vaddpd	%%ymm1,%%ymm7,%%ymm7		\n\t"\
		"vmovaps	%%ymm6,    (%%rax)	\n\t"\
		"vmovaps	%%ymm7,0x20(%%rax)	\n\t"\
		"vsubpd	%%ymm5,%%ymm2,%%ymm2		\n\t"\
		"vsubpd	%%ymm4,%%ymm3,%%ymm3		\n\t"\
		"vmovaps	%%ymm2,    (%%rdx)	\n\t"\
		"vmovaps	%%ymm3,0x20(%%rbx)	\n\t"\
		"vaddpd	%%ymm5,%%ymm5,%%ymm5		\n\t"\
		"vaddpd	%%ymm4,%%ymm4,%%ymm4		\n\t"\
		"vaddpd	%%ymm2,%%ymm5,%%ymm5		\n\t"\
		"vaddpd	%%ymm3,%%ymm4,%%ymm4		\n\t"\
		"vmovaps	%%ymm5,    (%%rbx)	\n\t"\
		"vmovaps	%%ymm4,0x20(%%rdx)	\n\t"\
		:					/* outputs: none */\
		:[in0] "m" (Xin0)	/* Input-address-16-tet base pointer */\
		,[off] "m" (Xoff)	/* and pointer to uint32 array of 16 double* index offsets */\
		,[__isrt2] "m" (Xisrt2)\
		,[__two] "m" (Xtwo)\
		,[__out0] "m" (Xout0)\
		,[__o1] "e" (Xo1)\
		,[__o2] "e" (Xo2)\
		,[__o3] "e" (Xo3)\
		,[__o4] "e" (Xo4)\
		: "cc","memory","rax","rbx","rcx","rdx","rdi","rsi","r8","r9","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7"		/* Clobbered registers */\
	);\
	}

  #endif	// AVX2/FMA3?

	/* With-twiddles out-of-place analog of above twiddleless DIT macro: 15 nontrivial complex input twiddles E1-f [E0 assumed = 1].
	The DIT version of this macro processes the twiddles in-order.
	NOTE: SINCE THIS MACRO IS SPECIFICALLY DESIGNED AS THE 2ND-PASS OF LARGE-POWER-OF-2-TWIDDLELESS DFT SYNTHESIS, THE
	"TWIDDLES" HERE ARE PURELY OF THE DFT-INTERNAL VARIETY, AND THUS APPLIED TO THE INPUTS, JUST AS FOR THE ABOVE DIF COUNTERPART.

	Sincos layout - Two portions:
	[NOTE: bytewise offsets below are w.r.to SSE2 version of code; AVX doubles these]

	Radix-16 shared consts anchored at isrt2:

	  isrt2 + 0x000;	cc0 + 0x010;	ss0 + 0x020;

	Per-block-specific set of 15 complex twiddles anchored at c1:

		c1  + 0x000;	s1  + 0x010;
		c2  + 0x020;	s2  + 0x030;
		c3  + 0x040;	s3  + 0x050;
		c4  + 0x060;	s4  + 0x070;
		c5  + 0x080;	s5  + 0x090;
		c6  + 0x0a0;	s6  + 0x0b0;
		c7  + 0x0c0;	s7  + 0x0d0;
		c8  + 0x0e0;	s8  + 0x0f0;
		c9  + 0x100;	s9  + 0x110;
		c10 + 0x120;	s10 + 0x130;
		c11 + 0x140;	s11 + 0x150;
		c12 + 0x160;	s12 + 0x170;
		c13 + 0x180;	s13 + 0x190;
		c14 + 0x1a0;	s14 + 0x1b0;
		c15 + 0x1c0;	s15 + 0x1d0;

	Use radix-16 DIF as template for DIT/OOP here, since need a pre-twiddles algorithm:
	*/
	/* SSE2_RADIX16_DIT_TWIDDLE_OOP: Out-of-place radix-16 DIT DFT with the 15 complex twiddles c1-f
	applied to the *inputs* (pre-twiddle form; see layout notes above - twiddles anchored at __c1,
	spaced 0x40 bytes apart in this AVX build, with Re/Im halves 0x20 bytes apart).
	Structure: four twiddle-multiplied radix-4 butterflies done in-place on __in0 + [0-3,4-7,8-b,c-f]*istride,
	followed by four radix-4 combine passes (using isrt2/cc0/ss0 internal twiddles, anchored at __isrt2)
	which write the final results to __out0 + [0-f]*ostride.
	Register roles: rax/rbx/rcx/rdx = input-quartet pointers (NB: rcx gets __i1, rbx gets __i2, i.e. the
	1/2-index addresses are deliberately swapped vs. alphabetical order); rsi = twiddle pointer (walks
	c1 -> c2,3 -> ... -> ce,f in 0x40/0x80-byte steps, then is repointed at isrt2 for the combine passes);
	r10-r13 = output pointers; ymm0-7 = scratch. Inputs/outputs assumed 32-byte aligned (vmovaps).
	Address-stride macro args __i1-__i4 / __o1-__o4 must be link-time constants (constraint "e", used via %c).
	Clobbers: cc, memory, rax,rbx,rcx,rdx,rsi, r10-r13, ymm0-7. */
	#define SSE2_RADIX16_DIT_TWIDDLE_OOP(Xin0,Xi1,Xi2,Xi3,Xi4, Xout0,Xo1,Xo2,Xo3,Xo4, Xisrt2,Xc1)\
	{\
	__asm__ volatile (\
	/*...Block 0: Do in-place, i.e. outputs into __in0 + [0,1,2,3]*istride: */\
		"movq	%[__in0],%%rax		\n\t"\
		"leaq	%c[__i1](%%rax),%%rcx	\n\t"/* __in0 +   istride */\
		"leaq	%c[__i2](%%rax),%%rbx	\n\t"/* __in0 + 2*istride */\
		"leaq	%c[__i3](%%rax),%%rdx	\n\t"/* __in0 + 3*istride */\
		"/* Do	the p0,1 combo: */	\n\t"\
		"movq	%[__c1],%%rsi 	/* c1 */\n\t"\
		"vmovaps	    (%%rax),%%ymm0	\n\t"\
		"vmovaps	    (%%rcx),%%ymm4	\n\t"\
		"vmovaps	0x20(%%rax),%%ymm1	\n\t"\
		"vmovaps	0x20(%%rcx),%%ymm5	\n\t"\
		"vmovaps	%%ymm4,%%ymm6		\n\t"\
		"vmovaps	%%ymm5,%%ymm7		\n\t"\
		"vmulpd	    (%%rsi),%%ymm4,%%ymm4	\n\t"\
		"vmulpd	    (%%rsi),%%ymm5,%%ymm5	\n\t"\
		"vmulpd	0x20(%%rsi),%%ymm6,%%ymm6	\n\t"\
		"vmovaps	%%ymm0,%%ymm2		\n\t"\
		"vmulpd	0x20(%%rsi),%%ymm7,%%ymm7	\n\t"\
		"vsubpd	%%ymm6,%%ymm5,%%ymm5		\n\t"\
		"vmovaps	%%ymm1,%%ymm3		\n\t"\
		"vaddpd	%%ymm7,%%ymm4,%%ymm4		\n\t"\
		"vaddpd	%%ymm4,%%ymm0,%%ymm0		\n\t"\
		"vaddpd	%%ymm5,%%ymm1,%%ymm1		\n\t"\
		"vsubpd	%%ymm4,%%ymm2,%%ymm2		\n\t"\
		"vsubpd	%%ymm5,%%ymm3,%%ymm3		\n\t"\
		"/* Do	the p2,3 combo: */	\n\t"\
		"addq	$0x40,%%rsi 	/* c2,3 */\n\t"\
		"vmovaps	    (%%rdx),%%ymm4	\n\t"\
		"vmovaps	0x20(%%rdx),%%ymm5	\n\t"\
		"vmovaps	    (%%rdx),%%ymm6	\n\t"\
		"vmovaps	0x20(%%rdx),%%ymm7	\n\t"\
		"vmulpd	0x40(%%rsi),%%ymm4,%%ymm4	/* c3 */\n\t"\
		"vmulpd	0x40(%%rsi),%%ymm5,%%ymm5	\n\t"\
		"vmulpd	0x60(%%rsi),%%ymm6,%%ymm6	\n\t"\
		"vmulpd	0x60(%%rsi),%%ymm7,%%ymm7	\n\t"\
		"vsubpd	%%ymm6,%%ymm5,%%ymm5		\n\t"\
		"vaddpd	%%ymm7,%%ymm4,%%ymm4		\n\t"\
		"vmovaps	%%ymm5,0x20(%%rax)	\n\t"\
		"vmovaps	%%ymm4,    (%%rax)	\n\t"\
		"vmovaps	    (%%rbx),%%ymm4	\n\t"\
		"vmovaps	0x20(%%rbx),%%ymm5	\n\t"\
		"vmovaps	    (%%rbx),%%ymm6	\n\t"\
		"vmovaps	0x20(%%rbx),%%ymm7	\n\t"\
		"vmulpd	    (%%rsi),%%ymm4,%%ymm4	/* c2 */\n\t"\
		"vmulpd	    (%%rsi),%%ymm5,%%ymm5	\n\t"\
		"vmulpd	0x20(%%rsi),%%ymm6,%%ymm6	\n\t"\
		"vmulpd	0x20(%%rsi),%%ymm7,%%ymm7	\n\t"\
		"vsubpd	%%ymm6,%%ymm5,%%ymm5		\n\t"\
		"vaddpd	%%ymm7,%%ymm4,%%ymm4		\n\t"\
		"vmovaps	%%ymm5,%%ymm7		\n\t"\
		"vmovaps	%%ymm4,%%ymm6		\n\t"\
		"vsubpd	    (%%rax),%%ymm4,%%ymm4	\n\t"\
		"vsubpd	0x20(%%rax),%%ymm5,%%ymm5	\n\t"\
		"vaddpd	    (%%rax),%%ymm6,%%ymm6	\n\t"\
		"vaddpd	0x20(%%rax),%%ymm7,%%ymm7	\n\t"\
		"/* Finish radix-4 bfly, store results: */\n\t"\
		"vsubpd	%%ymm6,%%ymm0,%%ymm0		\n\t"\
		"vsubpd	%%ymm5,%%ymm2,%%ymm2		\n\t"\
		"vsubpd	%%ymm7,%%ymm1,%%ymm1		\n\t"\
		"vsubpd	%%ymm4,%%ymm3,%%ymm3		\n\t"\
		/* DIT has outputs (indexed in real-temp form as 0-7) 2/6,3/7 swapped, i.e. swap oregs c/d vs DIF: */\
		"vmovaps	%%ymm0,    (%%rbx)	\n\t"\
		"vmovaps	%%ymm2,    (%%rdx)	\n\t"\
		"vmovaps	%%ymm1,0x20(%%rbx)	\n\t"\
		"vmovaps	%%ymm3,0x20(%%rcx)	\n\t"\
		"vaddpd	%%ymm6,%%ymm6,%%ymm6		\n\t"\
		"vaddpd	%%ymm5,%%ymm5,%%ymm5		\n\t"\
		"vaddpd	%%ymm7,%%ymm7,%%ymm7		\n\t"\
		"vaddpd	%%ymm4,%%ymm4,%%ymm4		\n\t"\
		"vaddpd	%%ymm0,%%ymm6,%%ymm6		\n\t"\
		"vaddpd	%%ymm2,%%ymm5,%%ymm5		\n\t"\
		"vaddpd	%%ymm1,%%ymm7,%%ymm7		\n\t"\
		"vaddpd	%%ymm3,%%ymm4,%%ymm4		\n\t"\
		"vmovaps	%%ymm6,    (%%rax)	\n\t"\
		"vmovaps	%%ymm5,    (%%rcx)	\n\t"\
		"vmovaps	%%ymm7,0x20(%%rax)	\n\t"\
		"vmovaps	%%ymm4,0x20(%%rdx)	\n\t"\
		"\n\t"\
	/*...Block 1: outputs into __in0 + [4,5,6,7]*istride: */\
		"addq	$%c[__i4],%%rax	\n\t"/* __in0 + 4*istride */\
		"addq	$%c[__i4],%%rcx	\n\t"/* __in0 + 5*istride */\
		"addq	$%c[__i4],%%rbx	\n\t"/* __in0 + 6*istride */\
		"addq	$%c[__i4],%%rdx	\n\t"/* __in0 + 7*istride */\
		"/* Do	the p0,1 combo: */	\n\t"\
		"addq	$0x80,%%rsi 	/* c4,5 */\n\t"\
		"vmovaps	    (%%rax),%%ymm0	\n\t"\
		"vmovaps	    (%%rcx),%%ymm4	\n\t"\
		"vmovaps	0x20(%%rax),%%ymm1	\n\t"\
		"vmovaps	0x20(%%rcx),%%ymm5	\n\t"\
		"vmovaps	    (%%rsi),%%ymm6	\n\t"\
		"vmovaps	0x20(%%rsi),%%ymm7	\n\t"\
		"vmovaps	%%ymm0,%%ymm2		\n\t"\
		"vmovaps	%%ymm1,%%ymm3		\n\t"\
		"vmulpd	%%ymm6,%%ymm0,%%ymm0		/* c4 */\n\t"\
		"vmulpd	%%ymm6,%%ymm1,%%ymm1		\n\t"\
		"vmulpd	%%ymm7,%%ymm2,%%ymm2		\n\t"\
		"vmulpd	%%ymm7,%%ymm3,%%ymm3		\n\t"\
		"vmovaps	%%ymm4,%%ymm6		\n\t"\
		"vsubpd	%%ymm2,%%ymm1,%%ymm1		\n\t"\
		"vmovaps	%%ymm5,%%ymm7		\n\t"\
		"vmulpd	0x40(%%rsi),%%ymm4,%%ymm4	/* c5 */\n\t"\
		"vaddpd	%%ymm3,%%ymm0,%%ymm0		\n\t"\
		"vmulpd	0x40(%%rsi),%%ymm5,%%ymm5	\n\t"\
		"vmulpd	0x60(%%rsi),%%ymm6,%%ymm6	\n\t"\
		"vmovaps	%%ymm0,%%ymm2		\n\t"\
		"vmulpd	0x60(%%rsi),%%ymm7,%%ymm7	\n\t"\
		"vsubpd	%%ymm6,%%ymm5,%%ymm5		\n\t"\
		"vmovaps	%%ymm1,%%ymm3		\n\t"\
		"vaddpd	%%ymm7,%%ymm4,%%ymm4		\n\t"\
		"vaddpd	%%ymm4,%%ymm0,%%ymm0		\n\t"\
		"vaddpd	%%ymm5,%%ymm1,%%ymm1		\n\t"\
		"vsubpd	%%ymm4,%%ymm2,%%ymm2		\n\t"\
		"vsubpd	%%ymm5,%%ymm3,%%ymm3		\n\t"\
		"/* Do	the p2,3 combo: */	\n\t"\
		"addq	$0x80,%%rsi 	/* c6,7 */\n\t"\
		"vmovaps	    (%%rdx),%%ymm4	\n\t"\
		"vmovaps	0x20(%%rdx),%%ymm5	\n\t"\
		"vmovaps	    (%%rdx),%%ymm6	\n\t"\
		"vmovaps	0x20(%%rdx),%%ymm7	\n\t"\
		"vmulpd	0x40(%%rsi),%%ymm4,%%ymm4	/* c7 */\n\t"\
		"vmulpd	0x40(%%rsi),%%ymm5,%%ymm5	\n\t"\
		"vmulpd	0x60(%%rsi),%%ymm6,%%ymm6	\n\t"\
		"vmulpd	0x60(%%rsi),%%ymm7,%%ymm7	\n\t"\
		"vsubpd	%%ymm6,%%ymm5,%%ymm5		\n\t"\
		"vaddpd	%%ymm7,%%ymm4,%%ymm4		\n\t"\
		"vmovaps	%%ymm5,0x20(%%rax)	\n\t"\
		"vmovaps	%%ymm4,    (%%rax)	\n\t"\
		"vmovaps	    (%%rbx),%%ymm4	\n\t"\
		"vmovaps	0x20(%%rbx),%%ymm5	\n\t"\
		"vmovaps	    (%%rbx),%%ymm6	\n\t"\
		"vmovaps	0x20(%%rbx),%%ymm7	\n\t"\
		"vmulpd	    (%%rsi),%%ymm4,%%ymm4	/* c6 */\n\t"\
		"vmulpd	    (%%rsi),%%ymm5,%%ymm5	\n\t"\
		"vmulpd	0x20(%%rsi),%%ymm6,%%ymm6	\n\t"\
		"vmulpd	0x20(%%rsi),%%ymm7,%%ymm7	\n\t"\
		"vsubpd	%%ymm6,%%ymm5,%%ymm5		\n\t"\
		"vaddpd	%%ymm7,%%ymm4,%%ymm4		\n\t"\
		"vmovaps	%%ymm5,%%ymm7		\n\t"\
		"vmovaps	%%ymm4,%%ymm6		\n\t"\
		"vsubpd	    (%%rax),%%ymm4,%%ymm4	\n\t"\
		"vsubpd	0x20(%%rax),%%ymm5,%%ymm5	\n\t"\
		"vaddpd	    (%%rax),%%ymm6,%%ymm6	\n\t"\
		"vaddpd	0x20(%%rax),%%ymm7,%%ymm7	\n\t"\
		"/* Finish radix-4 bfly, store results: */\n\t"\
		"vsubpd	%%ymm6,%%ymm0,%%ymm0		\n\t"\
		"vsubpd	%%ymm5,%%ymm2,%%ymm2		\n\t"\
		"vsubpd	%%ymm7,%%ymm1,%%ymm1		\n\t"\
		"vsubpd	%%ymm4,%%ymm3,%%ymm3		\n\t"\
		"vmovaps	%%ymm0,    (%%rbx)	\n\t"\
		"vmovaps	%%ymm2,    (%%rdx)	\n\t"\
		"vmovaps	%%ymm1,0x20(%%rbx)	\n\t"\
		"vmovaps	%%ymm3,0x20(%%rcx)	\n\t"\
		"vaddpd	%%ymm6,%%ymm6,%%ymm6		\n\t"\
		"vaddpd	%%ymm5,%%ymm5,%%ymm5		\n\t"\
		"vaddpd	%%ymm7,%%ymm7,%%ymm7		\n\t"\
		"vaddpd	%%ymm4,%%ymm4,%%ymm4		\n\t"\
		"vaddpd	%%ymm0,%%ymm6,%%ymm6		\n\t"\
		"vaddpd	%%ymm2,%%ymm5,%%ymm5		\n\t"\
		"vaddpd	%%ymm1,%%ymm7,%%ymm7		\n\t"\
		"vaddpd	%%ymm3,%%ymm4,%%ymm4		\n\t"\
		"vmovaps	%%ymm6,    (%%rax)	\n\t"\
		"vmovaps	%%ymm5,    (%%rcx)	\n\t"\
		"vmovaps	%%ymm7,0x20(%%rax)	\n\t"\
		"vmovaps	%%ymm4,0x20(%%rdx)	\n\t"\
		"\n\t"\
	/*...Block 2: outputs into __in0 + [8,9,a,b]*istride: */\
		"addq	$%c[__i4],%%rax	\n\t"/* __in0 + 8*istride */\
		"addq	$%c[__i4],%%rcx	\n\t"/* __in0 + 9*istride */\
		"addq	$%c[__i4],%%rbx	\n\t"/* __in0 + a*istride */\
		"addq	$%c[__i4],%%rdx	\n\t"/* __in0 + b*istride */\
		"/* Do	the p0,1 combo: */	\n\t"\
		"addq	$0x80,%%rsi 	/* c8,9 */\n\t"\
		"vmovaps	    (%%rax),%%ymm0	\n\t"\
		"vmovaps	    (%%rcx),%%ymm4	\n\t"\
		"vmovaps	0x20(%%rax),%%ymm1	\n\t"\
		"vmovaps	0x20(%%rcx),%%ymm5	\n\t"\
		"vmovaps	    (%%rsi),%%ymm6	\n\t"\
		"vmovaps	0x20(%%rsi),%%ymm7	\n\t"\
		"vmovaps	%%ymm0,%%ymm2		\n\t"\
		"vmovaps	%%ymm1,%%ymm3		\n\t"\
		"vmulpd	%%ymm6,%%ymm0,%%ymm0		/* c8 */\n\t"\
		"vmulpd	%%ymm6,%%ymm1,%%ymm1		\n\t"\
		"vmulpd	%%ymm7,%%ymm2,%%ymm2		\n\t"\
		"vmulpd	%%ymm7,%%ymm3,%%ymm3		\n\t"\
		"vmovaps	%%ymm4,%%ymm6		\n\t"\
		"vsubpd	%%ymm2,%%ymm1,%%ymm1		\n\t"\
		"vmovaps	%%ymm5,%%ymm7		\n\t"\
		"vmulpd	0x40(%%rsi),%%ymm4,%%ymm4	/* c9 */\n\t"\
		"vaddpd	%%ymm3,%%ymm0,%%ymm0		\n\t"\
		"vmulpd	0x40(%%rsi),%%ymm5,%%ymm5	\n\t"\
		"vmulpd	0x60(%%rsi),%%ymm6,%%ymm6	\n\t"\
		"vmovaps	%%ymm0,%%ymm2		\n\t"\
		"vmulpd	0x60(%%rsi),%%ymm7,%%ymm7	\n\t"\
		"vsubpd	%%ymm6,%%ymm5,%%ymm5		\n\t"\
		"vmovaps	%%ymm1,%%ymm3		\n\t"\
		"vaddpd	%%ymm7,%%ymm4,%%ymm4		\n\t"\
		"vaddpd	%%ymm4,%%ymm0,%%ymm0		\n\t"\
		"vaddpd	%%ymm5,%%ymm1,%%ymm1		\n\t"\
		"vsubpd	%%ymm4,%%ymm2,%%ymm2		\n\t"\
		"vsubpd	%%ymm5,%%ymm3,%%ymm3		\n\t"\
		"/* Do	the p2,3 combo: */	\n\t"\
		"addq	$0x80,%%rsi 	/* ca,b */\n\t"\
		"vmovaps	    (%%rdx),%%ymm4	\n\t"\
		"vmovaps	0x20(%%rdx),%%ymm5	\n\t"\
		"vmovaps	    (%%rdx),%%ymm6	\n\t"\
		"vmovaps	0x20(%%rdx),%%ymm7	\n\t"\
		"vmulpd	0x40(%%rsi),%%ymm4,%%ymm4	/* cb */\n\t"\
		"vmulpd	0x40(%%rsi),%%ymm5,%%ymm5	\n\t"\
		"vmulpd	0x60(%%rsi),%%ymm6,%%ymm6	\n\t"\
		"vmulpd	0x60(%%rsi),%%ymm7,%%ymm7	\n\t"\
		"vsubpd	%%ymm6,%%ymm5,%%ymm5		\n\t"\
		"vaddpd	%%ymm7,%%ymm4,%%ymm4		\n\t"\
		"vmovaps	%%ymm5,0x20(%%rax)	\n\t"\
		"vmovaps	%%ymm4,    (%%rax)	\n\t"\
		"vmovaps	    (%%rbx),%%ymm4	\n\t"\
		"vmovaps	0x20(%%rbx),%%ymm5	\n\t"\
		"vmovaps	    (%%rbx),%%ymm6	\n\t"\
		"vmovaps	0x20(%%rbx),%%ymm7	\n\t"\
		"vmulpd	    (%%rsi),%%ymm4,%%ymm4	/* ca */\n\t"\
		"vmulpd	    (%%rsi),%%ymm5,%%ymm5	\n\t"\
		"vmulpd	0x20(%%rsi),%%ymm6,%%ymm6	\n\t"\
		"vmulpd	0x20(%%rsi),%%ymm7,%%ymm7	\n\t"\
		"vsubpd	%%ymm6,%%ymm5,%%ymm5		\n\t"\
		"vaddpd	%%ymm7,%%ymm4,%%ymm4		\n\t"\
		"vmovaps	%%ymm5,%%ymm7		\n\t"\
		"vmovaps	%%ymm4,%%ymm6		\n\t"\
		"vsubpd	    (%%rax),%%ymm4,%%ymm4	\n\t"\
		"vsubpd	0x20(%%rax),%%ymm5,%%ymm5	\n\t"\
		"vaddpd	    (%%rax),%%ymm6,%%ymm6	\n\t"\
		"vaddpd	0x20(%%rax),%%ymm7,%%ymm7	\n\t"\
		"/* Finish radix-4 bfly, store results: */\n\t"\
		"vsubpd	%%ymm6,%%ymm0,%%ymm0		\n\t"\
		"vsubpd	%%ymm5,%%ymm2,%%ymm2		\n\t"\
		"vsubpd	%%ymm7,%%ymm1,%%ymm1		\n\t"\
		"vsubpd	%%ymm4,%%ymm3,%%ymm3		\n\t"\
		"vmovaps	%%ymm0,    (%%rbx)	\n\t"\
		"vmovaps	%%ymm2,    (%%rdx)	\n\t"\
		"vmovaps	%%ymm1,0x20(%%rbx)	\n\t"\
		"vmovaps	%%ymm3,0x20(%%rcx)	\n\t"\
		"vaddpd	%%ymm6,%%ymm6,%%ymm6		\n\t"\
		"vaddpd	%%ymm5,%%ymm5,%%ymm5		\n\t"\
		"vaddpd	%%ymm7,%%ymm7,%%ymm7		\n\t"\
		"vaddpd	%%ymm4,%%ymm4,%%ymm4		\n\t"\
		"vaddpd	%%ymm0,%%ymm6,%%ymm6		\n\t"\
		"vaddpd	%%ymm2,%%ymm5,%%ymm5		\n\t"\
		"vaddpd	%%ymm1,%%ymm7,%%ymm7		\n\t"\
		"vaddpd	%%ymm3,%%ymm4,%%ymm4		\n\t"\
		"vmovaps	%%ymm6,    (%%rax)	\n\t"\
		"vmovaps	%%ymm5,    (%%rcx)	\n\t"\
		"vmovaps	%%ymm7,0x20(%%rax)	\n\t"\
		"vmovaps	%%ymm4,0x20(%%rdx)	\n\t"\
		"\n\t"\
	/*...Block 3: outputs into __in0 + [c,d,e,f]*istride: */\
		"addq	$%c[__i4],%%rax	\n\t"/* __in0 + c*istride */\
		"addq	$%c[__i4],%%rcx	\n\t"/* __in0 + d*istride */\
		"addq	$%c[__i4],%%rbx	\n\t"/* __in0 + e*istride */\
		"addq	$%c[__i4],%%rdx	\n\t"/* __in0 + f*istride */\
		"/* Do	the p0,1 combo: */	\n\t"\
		"addq	$0x80,%%rsi 	/* cc,d */\n\t"\
		"vmovaps	    (%%rax),%%ymm0	\n\t"\
		"vmovaps	    (%%rcx),%%ymm4	\n\t"\
		"vmovaps	0x20(%%rax),%%ymm1	\n\t"\
		"vmovaps	0x20(%%rcx),%%ymm5	\n\t"\
		"vmovaps	    (%%rsi),%%ymm6	\n\t"\
		"vmovaps	0x20(%%rsi),%%ymm7	\n\t"\
		"vmovaps	%%ymm0,%%ymm2		\n\t"\
		"vmovaps	%%ymm1,%%ymm3		\n\t"\
		"vmulpd	%%ymm6,%%ymm0,%%ymm0		/* cc */\n\t"\
		"vmulpd	%%ymm6,%%ymm1,%%ymm1		\n\t"\
		"vmulpd	%%ymm7,%%ymm2,%%ymm2		\n\t"\
		"vmulpd	%%ymm7,%%ymm3,%%ymm3		\n\t"\
		"vmovaps	%%ymm4,%%ymm6		\n\t"\
		"vsubpd	%%ymm2,%%ymm1,%%ymm1		\n\t"\
		"vmovaps	%%ymm5,%%ymm7		\n\t"\
		"vmulpd	0x40(%%rsi),%%ymm4,%%ymm4	/* cd */\n\t"\
		"vaddpd	%%ymm3,%%ymm0,%%ymm0		\n\t"\
		"vmulpd	0x40(%%rsi),%%ymm5,%%ymm5	\n\t"\
		"vmulpd	0x60(%%rsi),%%ymm6,%%ymm6	\n\t"\
		"vmovaps	%%ymm0,%%ymm2		\n\t"\
		"vmulpd	0x60(%%rsi),%%ymm7,%%ymm7	\n\t"\
		"vsubpd	%%ymm6,%%ymm5,%%ymm5		\n\t"\
		"vmovaps	%%ymm1,%%ymm3		\n\t"\
		"vaddpd	%%ymm7,%%ymm4,%%ymm4		\n\t"\
		"vaddpd	%%ymm4,%%ymm0,%%ymm0		\n\t"\
		"vaddpd	%%ymm5,%%ymm1,%%ymm1		\n\t"\
		"vsubpd	%%ymm4,%%ymm2,%%ymm2		\n\t"\
		"vsubpd	%%ymm5,%%ymm3,%%ymm3		\n\t"\
		"/* Do	the p2,3 combo: */	\n\t"\
		"addq	$0x80,%%rsi 	/* ce,f */\n\t"\
		"vmovaps	    (%%rdx),%%ymm4	\n\t"\
		"vmovaps	0x20(%%rdx),%%ymm5	\n\t"\
		"vmovaps	    (%%rdx),%%ymm6	\n\t"\
		"vmovaps	0x20(%%rdx),%%ymm7	\n\t"\
		"vmulpd	0x40(%%rsi),%%ymm4,%%ymm4	/* cf */\n\t"\
		"vmulpd	0x40(%%rsi),%%ymm5,%%ymm5	\n\t"\
		"vmulpd	0x60(%%rsi),%%ymm6,%%ymm6	\n\t"\
		"vmulpd	0x60(%%rsi),%%ymm7,%%ymm7	\n\t"\
		"vsubpd	%%ymm6,%%ymm5,%%ymm5		\n\t"\
		"vaddpd	%%ymm7,%%ymm4,%%ymm4		\n\t"\
		"vmovaps	%%ymm5,0x20(%%rax)	\n\t"\
		"vmovaps	%%ymm4,    (%%rax)	\n\t"\
		"vmovaps	    (%%rbx),%%ymm4	\n\t"\
		"vmovaps	0x20(%%rbx),%%ymm5	\n\t"\
		"vmovaps	    (%%rbx),%%ymm6	\n\t"\
		"vmovaps	0x20(%%rbx),%%ymm7	\n\t"\
		"vmulpd	    (%%rsi),%%ymm4,%%ymm4	/* ce */\n\t"\
		"vmulpd	    (%%rsi),%%ymm5,%%ymm5	\n\t"\
		"vmulpd	0x20(%%rsi),%%ymm6,%%ymm6	\n\t"\
		"vmulpd	0x20(%%rsi),%%ymm7,%%ymm7	\n\t"\
		"vsubpd	%%ymm6,%%ymm5,%%ymm5		\n\t"\
		"vaddpd	%%ymm7,%%ymm4,%%ymm4		\n\t"\
		"vmovaps	%%ymm5,%%ymm7		\n\t"\
		"vmovaps	%%ymm4,%%ymm6		\n\t"\
		"vsubpd	    (%%rax),%%ymm4,%%ymm4	\n\t"\
		"vsubpd	0x20(%%rax),%%ymm5,%%ymm5	\n\t"\
		"vaddpd	    (%%rax),%%ymm6,%%ymm6	\n\t"\
		"vaddpd	0x20(%%rax),%%ymm7,%%ymm7	\n\t"\
		"/* Finish radix-4 bfly, store results: */\n\t"\
		"vsubpd	%%ymm6,%%ymm0,%%ymm0		\n\t"\
		"vsubpd	%%ymm5,%%ymm2,%%ymm2		\n\t"\
		"vsubpd	%%ymm7,%%ymm1,%%ymm1		\n\t"\
		"vsubpd	%%ymm4,%%ymm3,%%ymm3		\n\t"\
		"vmovaps	%%ymm0,    (%%rbx)	\n\t"\
		"vmovaps	%%ymm2,    (%%rdx)	\n\t"\
		"vmovaps	%%ymm1,0x20(%%rbx)	\n\t"\
		"vmovaps	%%ymm3,0x20(%%rcx)	\n\t"\
		"vaddpd	%%ymm6,%%ymm6,%%ymm6		\n\t"\
		"vaddpd	%%ymm5,%%ymm5,%%ymm5		\n\t"\
		"vaddpd	%%ymm7,%%ymm7,%%ymm7		\n\t"\
		"vaddpd	%%ymm4,%%ymm4,%%ymm4		\n\t"\
		"vaddpd	%%ymm0,%%ymm6,%%ymm6		\n\t"\
		"vaddpd	%%ymm2,%%ymm5,%%ymm5		\n\t"\
		"vaddpd	%%ymm1,%%ymm7,%%ymm7		\n\t"\
		"vaddpd	%%ymm3,%%ymm4,%%ymm4		\n\t"\
		"vmovaps	%%ymm6,    (%%rax)	\n\t"\
		"vmovaps	%%ymm5,    (%%rcx)	\n\t"\
		"vmovaps	%%ymm7,0x20(%%rax)	\n\t"\
		"vmovaps	%%ymm4,0x20(%%rdx)	\n\t"\
	/*************************************************************************************/\
	/*  And now do four more radix-4 transforms, including the internal twiddle factors: */\
	/*************************************************************************************/\
		"movq	%[__isrt2],%%rsi 	\n\t"/* rsi now points at isrt2; cc0 = +0x20, ss0 = +0x40 */\
	/* Block 0: Combine 0-output of each radix-4, i.e. inputs from __in0 + [0,4,8,c]*istride: */\
		"movq	%[__in0],%%rax		\n\t"\
		"leaq	%c[__i4](%%rax),%%rbx	\n\t"/* __in0 +   [4*istride] */\
		"leaq	%c[__i4](%%rbx),%%rcx	\n\t"/* __in0 + 2*[4*istride] */\
		"leaq	%c[__i4](%%rcx),%%rdx	\n\t"/* __in0 + 3*[4*istride] */\
		"vmovaps	    (%%rax),%%ymm0	\n\t"\
		"vmovaps	    (%%rcx),%%ymm4	\n\t"\
		"vmovaps	0x20(%%rax),%%ymm1	\n\t"\
		"vmovaps	0x20(%%rcx),%%ymm5	\n\t"\
		"vmovaps	    (%%rbx),%%ymm2	\n\t"\
		"vmovaps	    (%%rdx),%%ymm6	\n\t"\
		"vmovaps	0x20(%%rbx),%%ymm3	\n\t"\
		"vmovaps	0x20(%%rdx),%%ymm7	\n\t"\
		"vsubpd	%%ymm2,%%ymm0,%%ymm0		\n\t"\
		"vsubpd	%%ymm6,%%ymm4,%%ymm4		\n\t"\
		"vsubpd	%%ymm3,%%ymm1,%%ymm1		\n\t"\
		"vsubpd	%%ymm7,%%ymm5,%%ymm5		\n\t"\
		"vaddpd	%%ymm2,%%ymm2,%%ymm2		\n\t"\
		"vaddpd	%%ymm6,%%ymm6,%%ymm6		\n\t"\
		"vaddpd	%%ymm3,%%ymm3,%%ymm3		\n\t"\
		"vaddpd	%%ymm7,%%ymm7,%%ymm7		\n\t"\
		"vaddpd	%%ymm0,%%ymm2,%%ymm2		\n\t"\
		"vaddpd	%%ymm4,%%ymm6,%%ymm6		\n\t"\
		"vaddpd	%%ymm1,%%ymm3,%%ymm3		\n\t"\
		"vaddpd	%%ymm5,%%ymm7,%%ymm7		\n\t"\
		"vsubpd	%%ymm6,%%ymm2,%%ymm2		\n\t"\
		"vsubpd	%%ymm7,%%ymm3,%%ymm3		\n\t"\
		"vaddpd	%%ymm6,%%ymm6,%%ymm6		\n\t"\
		"vaddpd	%%ymm7,%%ymm7,%%ymm7		\n\t"\
		"movq	%[__out0],%%r10		\n\t"/* r10-13 = output-pointer quartet, bumped by +ostride per combine pass */\
		"leaq	%c[__o4](%%r10),%%r11	\n\t"/* __out0 + 4*ostride */\
		"leaq	%c[__o4](%%r11),%%r12	\n\t"/* __out0 + 8*ostride */\
		"leaq	%c[__o4](%%r12),%%r13	\n\t"/* __out0 + c*ostride */\
		"vmovaps	%%ymm2,    (%%r12)	\n\t"\
		"vmovaps	%%ymm3,0x20(%%r12)	\n\t"\
		"vaddpd	%%ymm2,%%ymm6,%%ymm6	\n\t"\
		"vaddpd	%%ymm3,%%ymm7,%%ymm7	\n\t"\
		"vmovaps	%%ymm6,    (%%r10)	\n\t"\
		"vmovaps	%%ymm7,0x20(%%r10)	\n\t"\
		"vsubpd	%%ymm5,%%ymm0,%%ymm0	\n\t"\
		"vsubpd	%%ymm4,%%ymm1,%%ymm1	\n\t"\
		"vaddpd	%%ymm5,%%ymm5,%%ymm5	\n\t"\
		"vaddpd	%%ymm4,%%ymm4,%%ymm4	\n\t"\
		"vmovaps	%%ymm0,    (%%r13)	\n\t"/* These 2 outputs [4/c] swapped w.r.to dif [2/3] due to +-I sign diff */\
		"vmovaps	%%ymm1,0x20(%%r11)	\n\t"\
		"vaddpd	%%ymm0,%%ymm5,%%ymm5	\n\t"\
		"vaddpd	%%ymm1,%%ymm4,%%ymm4	\n\t"\
		"vmovaps	%%ymm5,    (%%r11)	\n\t"\
		"vmovaps	%%ymm4,0x20(%%r13)	\n\t"\
	/* Block 1: Combine 1-output of each radix-4, i.e. inputs from __in0 + [1,5,9,d]*istride: */\
		"addq	$%c[__i1],%%rax	\n\t"/* __in0 + 1*istride */\
		"addq	$%c[__i1],%%rcx	\n\t"/* __in0 + 5*istride */\
		"addq	$%c[__i1],%%rbx	\n\t"/* __in0 + 9*istride */\
		"addq	$%c[__i1],%%rdx	\n\t"/* __in0 + d*istride */\
		"vmovaps	    (%%rdx),%%ymm0	\n\t"\
		"vmovaps	0x20(%%rdx),%%ymm1	\n\t"\
		"vmovaps	    (%%rdx),%%ymm2	\n\t"\
		"vmovaps	0x20(%%rdx),%%ymm3	\n\t"\
		"vmulpd	0x40(%%rsi),%%ymm0,%%ymm0	\n\t"/* ss0 */\
		"vmulpd	0x40(%%rsi),%%ymm1,%%ymm1	\n\t"\
		"vmulpd	0x20(%%rsi),%%ymm2,%%ymm2	\n\t"/* cc0 */\
		"vmulpd	0x20(%%rsi),%%ymm3,%%ymm3	\n\t"\
		"vsubpd	%%ymm2,%%ymm1,%%ymm1		\n\t"\
		"vaddpd	%%ymm3,%%ymm0,%%ymm0		\n\t"\
		"vmovaps	    (%%rcx),%%ymm4	\n\t"\
		"vmovaps	0x20(%%rcx),%%ymm5	\n\t"\
		"vmovaps	    (%%rcx),%%ymm6	\n\t"\
		"vmovaps	0x20(%%rcx),%%ymm7	\n\t"\
		"vmulpd	0x20(%%rsi),%%ymm4,%%ymm4	\n\t"/* cc0 */\
		"vmulpd	0x20(%%rsi),%%ymm5,%%ymm5	\n\t"\
		"vmulpd	0x40(%%rsi),%%ymm6,%%ymm6	\n\t"/* ss0 */\
		"vmulpd	0x40(%%rsi),%%ymm7,%%ymm7	\n\t"\
		"vsubpd	%%ymm6,%%ymm5,%%ymm5		\n\t"\
		"vaddpd	%%ymm7,%%ymm4,%%ymm4		\n\t"\
		"vmovaps	%%ymm5,%%ymm7		\n\t"\
		"vmovaps	%%ymm4,%%ymm6		\n\t"\
		"vaddpd	%%ymm0,%%ymm4,%%ymm4		\n\t"\
		"vaddpd	%%ymm1,%%ymm5,%%ymm5		\n\t"\
		"vsubpd	%%ymm0,%%ymm6,%%ymm6		\n\t"\
		"vsubpd	%%ymm1,%%ymm7,%%ymm7		\n\t"\
		"vmovaps	    (%%rbx),%%ymm2	\n\t"\
		"vmovaps	0x20(%%rbx),%%ymm3	\n\t"\
		"vmovaps	    (%%rax),%%ymm0	\n\t"\
		"vmovaps	0x20(%%rax),%%ymm1	\n\t"\
		"vaddpd	0x20(%%rbx),%%ymm2,%%ymm2	\n\t"\
		"vsubpd	    (%%rbx),%%ymm3,%%ymm3	\n\t"\
		"vmulpd	    (%%rsi),%%ymm2,%%ymm2	\n\t"/* isrt2 */\
		"vmulpd	    (%%rsi),%%ymm3,%%ymm3	\n\t"\
		"vsubpd	%%ymm2,%%ymm0,%%ymm0		\n\t"\
		"vsubpd	%%ymm3,%%ymm1,%%ymm1		\n\t"\
		"vaddpd	%%ymm2,%%ymm2,%%ymm2		\n\t"\
		"vaddpd	%%ymm3,%%ymm3,%%ymm3		\n\t"\
		"vaddpd	%%ymm0,%%ymm2,%%ymm2		\n\t"\
		"vaddpd	%%ymm1,%%ymm3,%%ymm3		\n\t"\
		"addq	$%c[__o1],%%r10	\n\t"/* __out0 + 1*ostride */\
		"addq	$%c[__o1],%%r11	\n\t"/* __out0 + 5*ostride */\
		"addq	$%c[__o1],%%r12	\n\t"/* __out0 + 9*ostride */\
		"addq	$%c[__o1],%%r13	\n\t"/* __out0 + d*ostride */\
		"vsubpd	%%ymm4,%%ymm2,%%ymm2		\n\t"\
		"vsubpd	%%ymm5,%%ymm3,%%ymm3		\n\t"\
		"vmovaps	%%ymm2,    (%%r12)	\n\t"\
		"vmovaps	%%ymm3,0x20(%%r12)	\n\t"\
		"vaddpd	%%ymm4,%%ymm4,%%ymm4		\n\t"\
		"vaddpd	%%ymm5,%%ymm5,%%ymm5		\n\t"\
		"vaddpd	%%ymm2,%%ymm4,%%ymm4		\n\t"\
		"vaddpd	%%ymm3,%%ymm5,%%ymm5		\n\t"\
		"vmovaps	%%ymm4,    (%%r10)	\n\t"\
		"vmovaps	%%ymm5,0x20(%%r10)	\n\t"\
		"vsubpd	%%ymm7,%%ymm0,%%ymm0		\n\t"\
		"vsubpd	%%ymm6,%%ymm1,%%ymm1		\n\t"\
		"vmovaps	%%ymm0,    (%%r13)	\n\t"\
		"vmovaps	%%ymm1,0x20(%%r11)	\n\t"\
		"vaddpd	%%ymm7,%%ymm7,%%ymm7		\n\t"\
		"vaddpd	%%ymm6,%%ymm6,%%ymm6		\n\t"\
		"vaddpd	%%ymm0,%%ymm7,%%ymm7		\n\t"\
		"vaddpd	%%ymm1,%%ymm6,%%ymm6		\n\t"\
		"vmovaps	%%ymm7,    (%%r11)	\n\t"\
		"vmovaps	%%ymm6,0x20(%%r13)	\n\t"\
	/* Block 2: Combine 2-output of each radix-4, i.e. inputs from __in0 + [2,6,a,e]*istride: */\
		"vmovaps	(%%rsi),%%ymm2	/* isrt2 */\n\t"\
		"addq	$%c[__i1],%%rax	\n\t"/* __in0 + 2*istride */\
		"addq	$%c[__i1],%%rbx	\n\t"/* __in0 + 6*istride */\
		"addq	$%c[__i1],%%rcx	\n\t"/* __in0 + a*istride */\
		"addq	$%c[__i1],%%rdx	\n\t"/* __in0 + e*istride */\
		"vmovaps	    (%%rcx),%%ymm4	\n\t"\
		"vmovaps	0x20(%%rcx),%%ymm5	\n\t"\
		"vmovaps	    (%%rdx),%%ymm0	\n\t"\
		"vmovaps	0x20(%%rdx),%%ymm1	\n\t"\
		"vaddpd	0x20(%%rcx),%%ymm4,%%ymm4	\n\t"\
		"vsubpd	    (%%rcx),%%ymm5,%%ymm5	\n\t"\
		"vsubpd	0x20(%%rdx),%%ymm0,%%ymm0	\n\t"\
		"vaddpd	    (%%rdx),%%ymm1,%%ymm1	\n\t"\
		"vmulpd	%%ymm2,%%ymm4,%%ymm4		\n\t"\
		"vmulpd	%%ymm2,%%ymm5,%%ymm5		\n\t"\
		"vmulpd	%%ymm2,%%ymm0,%%ymm0		\n\t"\
		"vmulpd	%%ymm2,%%ymm1,%%ymm1		\n\t"\
		"vmovaps	%%ymm4,%%ymm6		\n\t"\
		"vmovaps	%%ymm5,%%ymm7		\n\t"\
		"vsubpd	%%ymm0,%%ymm4,%%ymm4		\n\t"\
		"vsubpd	%%ymm1,%%ymm5,%%ymm5		\n\t"\
		"vaddpd	%%ymm0,%%ymm6,%%ymm6		\n\t"\
		"vaddpd	%%ymm1,%%ymm7,%%ymm7		\n\t"\
		"vmovaps	    (%%rax),%%ymm0	\n\t"\
		"vmovaps	0x20(%%rax),%%ymm1	\n\t"\
		"vmovaps	    (%%rbx),%%ymm2	\n\t"\
		"vmovaps	0x20(%%rbx),%%ymm3	\n\t"\
		"vsubpd	0x20(%%rbx),%%ymm0,%%ymm0	\n\t"\
		"vsubpd	    (%%rbx),%%ymm1,%%ymm1	\n\t"\
		"vaddpd	    (%%rax),%%ymm3,%%ymm3	\n\t"\
		"vaddpd	0x20(%%rax),%%ymm2,%%ymm2	\n\t"\
		"addq	$%c[__o1],%%r10	\n\t"/* __out0 + 2*ostride */\
		"addq	$%c[__o1],%%r11	\n\t"/* __out0 + 6*ostride */\
		"addq	$%c[__o1],%%r12	\n\t"/* __out0 + a*ostride */\
		"addq	$%c[__o1],%%r13	\n\t"/* __out0 + e*ostride */\
		"vsubpd	%%ymm4,%%ymm3,%%ymm3		\n\t"\
		"vsubpd	%%ymm5,%%ymm1,%%ymm1		\n\t"\
		"vmovaps	%%ymm3,    (%%r12)	\n\t"\
		"vmovaps	%%ymm1,0x20(%%r12)	\n\t"\
		"vaddpd	%%ymm4,%%ymm4,%%ymm4		\n\t"\
		"vaddpd	%%ymm5,%%ymm5,%%ymm5		\n\t"\
		"vaddpd	%%ymm3,%%ymm4,%%ymm4		\n\t"\
		"vaddpd	%%ymm1,%%ymm5,%%ymm5		\n\t"\
		"vmovaps	%%ymm4,    (%%r10)	\n\t"\
		"vmovaps	%%ymm5,0x20(%%r10)	\n\t"\
		"vsubpd	%%ymm7,%%ymm0,%%ymm0		\n\t"\
		"vsubpd	%%ymm6,%%ymm2,%%ymm2		\n\t"\
		"vmovaps	%%ymm0,    (%%r13)	\n\t"\
		"vmovaps	%%ymm2,0x20(%%r11)	\n\t"\
		"vaddpd	%%ymm7,%%ymm7,%%ymm7		\n\t"\
		"vaddpd	%%ymm6,%%ymm6,%%ymm6		\n\t"\
		"vaddpd	%%ymm0,%%ymm7,%%ymm7		\n\t"\
		"vaddpd	%%ymm2,%%ymm6,%%ymm6		\n\t"\
		"vmovaps	%%ymm7,    (%%r11)	\n\t"\
		"vmovaps	%%ymm6,0x20(%%r13)	\n\t"\
	/* Block 3: Combine 3-output of each radix-4, i.e. inputs from __in0 + [3,7,b,f]*istride: */\
		"addq	$%c[__i1],%%rax	\n\t"/* __in0 + 3*istride */\
		"addq	$%c[__i1],%%rcx	\n\t"/* __in0 + 7*istride */\
		"addq	$%c[__i1],%%rbx	\n\t"/* __in0 + b*istride */\
		"addq	$%c[__i1],%%rdx	\n\t"/* __in0 + f*istride */\
		"vmovaps	    (%%rdx),%%ymm0	\n\t"\
		"vmovaps	0x20(%%rdx),%%ymm1	\n\t"\
		"vmovaps	    (%%rdx),%%ymm2	\n\t"\
		"vmovaps	0x20(%%rdx),%%ymm3	\n\t"\
		"vmulpd	0x20(%%rsi),%%ymm0,%%ymm0	\n\t"/* cc0 */\
		"vmulpd	0x20(%%rsi),%%ymm1,%%ymm1	\n\t"\
		"vmulpd	0x40(%%rsi),%%ymm2,%%ymm2	\n\t"/* ss0 */\
		"vmulpd	0x40(%%rsi),%%ymm3,%%ymm3	\n\t"\
		"vsubpd	%%ymm2,%%ymm1,%%ymm1		\n\t"\
		"vaddpd	%%ymm3,%%ymm0,%%ymm0		\n\t"\
		"vmovaps	    (%%rcx),%%ymm4	\n\t"\
		"vmovaps	0x20(%%rcx),%%ymm5	\n\t"\
		"vmovaps	    (%%rcx),%%ymm6	\n\t"\
		"vmovaps	0x20(%%rcx),%%ymm7	\n\t"\
		"vmulpd	0x40(%%rsi),%%ymm4,%%ymm4	\n\t"/* ss0 */\
		"vmulpd	0x40(%%rsi),%%ymm5,%%ymm5	\n\t"\
		"vmulpd	0x20(%%rsi),%%ymm6,%%ymm6	\n\t"/* cc0 */\
		"vmulpd	0x20(%%rsi),%%ymm7,%%ymm7	\n\t"\
		"vsubpd	%%ymm6,%%ymm5,%%ymm5		\n\t"\
		"vaddpd	%%ymm7,%%ymm4,%%ymm4		\n\t"\
		"vmovaps	%%ymm5,%%ymm7		\n\t"\
		"vmovaps	%%ymm4,%%ymm6		\n\t"\
		"vaddpd	%%ymm0,%%ymm4,%%ymm4		\n\t"\
		"vaddpd	%%ymm1,%%ymm5,%%ymm5		\n\t"\
		"vsubpd	%%ymm0,%%ymm6,%%ymm6		\n\t"\
		"vsubpd	%%ymm1,%%ymm7,%%ymm7		\n\t"\
		"vmovaps	    (%%rbx),%%ymm2	\n\t"\
		"vmovaps	0x20(%%rbx),%%ymm3	\n\t"\
		"vmovaps	    (%%rax),%%ymm0	\n\t"\
		"vmovaps	0x20(%%rax),%%ymm1	\n\t"\
		"vsubpd	0x20(%%rbx),%%ymm2,%%ymm2	\n\t"\
		"vaddpd	    (%%rbx),%%ymm3,%%ymm3	\n\t"\
		"vmulpd	    (%%rsi),%%ymm2,%%ymm2	\n\t"/* isrt2 */\
		"vmulpd	    (%%rsi),%%ymm3,%%ymm3	\n\t"\
		"vsubpd	%%ymm2,%%ymm0,%%ymm0		\n\t"\
		"vsubpd	%%ymm3,%%ymm1,%%ymm1		\n\t"\
		"vaddpd	%%ymm2,%%ymm2,%%ymm2		\n\t"\
		"vaddpd	%%ymm3,%%ymm3,%%ymm3		\n\t"\
		"vaddpd	%%ymm0,%%ymm2,%%ymm2		\n\t"\
		"vaddpd	%%ymm1,%%ymm3,%%ymm3		\n\t"\
		"addq	$%c[__o1],%%r10	\n\t"/* __out0 + 3*ostride */\
		"addq	$%c[__o1],%%r12	\n\t"/* __out0 + 7*ostride */\
		"addq	$%c[__o1],%%r11	\n\t"/* __out0 + b*ostride */\
		"addq	$%c[__o1],%%r13	\n\t"/* __out0 + f*ostride */\
		"vsubpd	%%ymm6,%%ymm0,%%ymm0		\n\t"\
		"vsubpd	%%ymm7,%%ymm1,%%ymm1		\n\t"\
		"vmovaps	%%ymm0,    (%%r12)	\n\t"\
		"vmovaps	%%ymm1,0x20(%%r12)	\n\t"\
		"vaddpd	%%ymm6,%%ymm6,%%ymm6		\n\t"\
		"vaddpd	%%ymm7,%%ymm7,%%ymm7		\n\t"\
		"vaddpd	%%ymm0,%%ymm6,%%ymm6		\n\t"\
		"vaddpd	%%ymm1,%%ymm7,%%ymm7		\n\t"\
		"vmovaps	%%ymm6,    (%%r10)	\n\t"\
		"vmovaps	%%ymm7,0x20(%%r10)	\n\t"\
		"vsubpd	%%ymm5,%%ymm2,%%ymm2		\n\t"\
		"vsubpd	%%ymm4,%%ymm3,%%ymm3		\n\t"\
		"vmovaps	%%ymm2,    (%%r13)	\n\t"\
		"vmovaps	%%ymm3,0x20(%%r11)	\n\t"\
		"vaddpd	%%ymm5,%%ymm5,%%ymm5		\n\t"\
		"vaddpd	%%ymm4,%%ymm4,%%ymm4		\n\t"\
		"vaddpd	%%ymm2,%%ymm5,%%ymm5		\n\t"\
		"vaddpd	%%ymm3,%%ymm4,%%ymm4		\n\t"\
		"vmovaps	%%ymm5,    (%%r11)	\n\t"\
		"vmovaps	%%ymm4,0x20(%%r13)	\n\t"\
		:					/* outputs: none */\
		: [__in0] "m" (Xin0)	/* All inputs from memory addresses here */\
		 ,[__i1] "e" (Xi1)\
		 ,[__i2] "e" (Xi2)\
		 ,[__i3] "e" (Xi3)\
		 ,[__i4] "e" (Xi4)\
		 ,[__out0] "m" (Xout0)\
		 ,[__o1] "e" (Xo1)\
		 ,[__o2] "e" (Xo2)\
		 ,[__o3] "e" (Xo3)\
		 ,[__o4] "e" (Xo4)\
		 ,[__isrt2] "m" (Xisrt2)\
		 ,[__c1] "m" (Xc1)\
		: "cc","memory","rax","rbx","rcx","rdx","rsi","r10","r11","r12","r13","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7"		/* Clobbered registers */\
	);\
	}

	// DIF version of above shares same sincos layout & data:
	/* SSE2_RADIX16_DIF_TWIDDLE_OOP: AVX (ymm) radix-16 decimation-in-frequency FFT pass
	with twiddles, out-of-place. Structure: four radix-4 DIF butterflies done in-place on
	__in0 + [0..f]*istride (first pass applies the c1-c15 twiddle roots), followed by four
	radix-4 combine passes (with the internal isrt2/cc0/ss0 twiddles) whose outputs are
	scattered to __out0 + off[j], where __off points at sixteen 32-bit offsets which are
	sign-extended (movslq) and scaled by 8 bytes (doubles) via leaq.
	Arguments:
	  Xin0,Xout0,Xoff,Xisrt2,Xc1 : memory operands (pointers);
	  Xi1,Xi4 : compile-time byte strides ("e" constraints) = istride and 4*istride.
	Data layout: each complex vector operand is 0x40 bytes, Re at +0x00, Im at +0x20;
	all loads/stores use vmovaps, so every input/output/table address must be 32-byte
	aligned. __c1 is the base of the roots table with 0x40-byte [re,im] pairs per root,
	laid out c1,c2,...,c15 (same layout as the DIT macro above); __isrt2 points at
	[isrt2 | cc0 | ss0] at 0x20-byte spacing.
	Register roles: rax/rbx/rcx/rdx = current input quartet; rsi = roots (later trig)
	pointer; r8 = out0 base; r9 = offset array; r10-r13 = computed output addresses;
	ymm0-7 = working data. Clobbers all of these plus flags. */
	#define SSE2_RADIX16_DIF_TWIDDLE_OOP(Xin0,Xi1,Xi4, Xout0,Xoff, Xisrt2,Xc1)\
	{\
	__asm__ volatile (\
	/*...Block 0: Do in-place, i.e. outputs into __in0 + [0,1,2,3]*istride: */\
		"movq	%[__in0],%%rax		\n\t"\
		"leaq	%c[__i1](%%rax),%%rcx	\n\t"/* __in0 +   istride */\
		"leaq	%c[__i1](%%rcx),%%rbx	\n\t"/* __in0 + 2*istride */\
		"leaq	%c[__i1](%%rbx),%%rdx	\n\t"/* __in0 + 3*istride */\
		"/* Do	the p0,1 combo: */	\n\t"\
		"movq	%[__c1],%%rsi 	/* Roots sets c1-15 same as for DIT, w/c1 as base-ptr */\n\t"\
		"vmovaps	    (%%rax),%%ymm0	\n\t"\
		"vmovaps	    (%%rcx),%%ymm4	\n\t"\
		"vmovaps	0x20(%%rax),%%ymm1	\n\t"\
		"vmovaps	0x20(%%rcx),%%ymm5	\n\t"\
		"vmovaps	%%ymm4,%%ymm6		\n\t"\
		"vmovaps	%%ymm5,%%ymm7		\n\t"\
		"vmulpd	    (%%rsi),%%ymm4,%%ymm4	\n\t"\
		"vmulpd	    (%%rsi),%%ymm5,%%ymm5	\n\t"\
		"vmulpd	0x20(%%rsi),%%ymm6,%%ymm6	\n\t"\
		"vmovaps	%%ymm0,%%ymm2		\n\t"\
		"vmulpd	0x20(%%rsi),%%ymm7,%%ymm7	\n\t"\
		"vaddpd	%%ymm6,%%ymm5,%%ymm5		\n\t"\
		"vmovaps	%%ymm1,%%ymm3		\n\t"\
		"vsubpd	%%ymm7,%%ymm4,%%ymm4		\n\t"\
		"vaddpd	%%ymm4,%%ymm0,%%ymm0		\n\t"\
		"vaddpd	%%ymm5,%%ymm1,%%ymm1		\n\t"\
		"vsubpd	%%ymm4,%%ymm2,%%ymm2		\n\t"\
		"vsubpd	%%ymm5,%%ymm3,%%ymm3		\n\t"\
		"/* Do	the p2,3 combo: */	\n\t"\
		"addq	$0x40,%%rsi 	/* c2,3 */\n\t"\
		"vmovaps	    (%%rdx),%%ymm4	\n\t"\
		"vmovaps	0x20(%%rdx),%%ymm5	\n\t"\
		"vmovaps	    (%%rdx),%%ymm6	\n\t"\
		"vmovaps	0x20(%%rdx),%%ymm7	\n\t"\
		"vmulpd	0x40(%%rsi),%%ymm4,%%ymm4	/* c3 */\n\t"\
		"vmulpd	0x40(%%rsi),%%ymm5,%%ymm5	\n\t"\
		"vmulpd	0x60(%%rsi),%%ymm6,%%ymm6	\n\t"\
		"vmulpd	0x60(%%rsi),%%ymm7,%%ymm7	\n\t"\
		"vaddpd	%%ymm6,%%ymm5,%%ymm5		\n\t"\
		"vsubpd	%%ymm7,%%ymm4,%%ymm4		\n\t"\
		"vmovaps	%%ymm5,0x20(%%rax)	\n\t"/* Stash twiddled rdx-operand in the rax slots, reloaded below */\
		"vmovaps	%%ymm4,    (%%rax)	\n\t"\
		"vmovaps	    (%%rbx),%%ymm4	\n\t"\
		"vmovaps	0x20(%%rbx),%%ymm5	\n\t"\
		"vmovaps	    (%%rbx),%%ymm6	\n\t"\
		"vmovaps	0x20(%%rbx),%%ymm7	\n\t"\
		"vmulpd	    (%%rsi),%%ymm4,%%ymm4	/* c2 */\n\t"\
		"vmulpd	    (%%rsi),%%ymm5,%%ymm5	\n\t"\
		"vmulpd	0x20(%%rsi),%%ymm6,%%ymm6	\n\t"\
		"vmulpd	0x20(%%rsi),%%ymm7,%%ymm7	\n\t"\
		"vaddpd	%%ymm6,%%ymm5,%%ymm5		\n\t"\
		"vsubpd	%%ymm7,%%ymm4,%%ymm4		\n\t"\
		"vmovaps	%%ymm5,%%ymm7		\n\t"\
		"vmovaps	%%ymm4,%%ymm6		\n\t"\
		"vsubpd	    (%%rax),%%ymm4,%%ymm4	\n\t"\
		"vsubpd	0x20(%%rax),%%ymm5,%%ymm5	\n\t"\
		"vaddpd	    (%%rax),%%ymm6,%%ymm6	\n\t"\
		"vaddpd	0x20(%%rax),%%ymm7,%%ymm7	\n\t"\
		"/* Finish radix-4 bfly, store results: */\n\t"\
		"vsubpd	%%ymm6,%%ymm0,%%ymm0		\n\t"\
		"vsubpd	%%ymm5,%%ymm2,%%ymm2		\n\t"\
		"vsubpd	%%ymm7,%%ymm1,%%ymm1		\n\t"\
		"vsubpd	%%ymm4,%%ymm3,%%ymm3		\n\t"\
		"vmovaps	%%ymm0,    (%%rcx)	\n\t"\
		"vmovaps	%%ymm2,    (%%rbx)	\n\t"\
		"vmovaps	%%ymm1,0x20(%%rcx)	\n\t"\
		"vmovaps	%%ymm3,0x20(%%rdx)	\n\t"\
		"vaddpd	%%ymm6,%%ymm6,%%ymm6		\n\t"\
		"vaddpd	%%ymm5,%%ymm5,%%ymm5		\n\t"\
		"vaddpd	%%ymm7,%%ymm7,%%ymm7		\n\t"\
		"vaddpd	%%ymm4,%%ymm4,%%ymm4		\n\t"\
		"vaddpd	%%ymm0,%%ymm6,%%ymm6		\n\t"\
		"vaddpd	%%ymm2,%%ymm5,%%ymm5		\n\t"\
		"vaddpd	%%ymm1,%%ymm7,%%ymm7		\n\t"\
		"vaddpd	%%ymm3,%%ymm4,%%ymm4		\n\t"\
		"vmovaps	%%ymm6,    (%%rax)	\n\t"\
		"vmovaps	%%ymm5,    (%%rdx)	\n\t"\
		"vmovaps	%%ymm7,0x20(%%rax)	\n\t"\
		"vmovaps	%%ymm4,0x20(%%rbx)	\n\t"\
		"\n\t"\
	/*...Block 1: outputs into __in0 + [4,5,6,7]*istride: */\
		"addq	$%c[__i4],%%rax	\n\t"/* __in0 + 4*istride */\
		"addq	$%c[__i4],%%rcx	\n\t"/* __in0 + 5*istride */\
		"addq	$%c[__i4],%%rbx	\n\t"/* __in0 + 6*istride */\
		"addq	$%c[__i4],%%rdx	\n\t"/* __in0 + 7*istride */\
		"/* Do	the p0,1 combo: */	\n\t"\
		"addq	$0x80,%%rsi 	/* c4,5 */\n\t"\
		"vmovaps	    (%%rax),%%ymm0	\n\t"\
		"vmovaps	    (%%rcx),%%ymm4	\n\t"\
		"vmovaps	0x20(%%rax),%%ymm1	\n\t"\
		"vmovaps	0x20(%%rcx),%%ymm5	\n\t"\
		"vmovaps	    (%%rsi),%%ymm6	\n\t"\
		"vmovaps	0x20(%%rsi),%%ymm7	\n\t"\
		"vmovaps	%%ymm0,%%ymm2		\n\t"\
		"vmovaps	%%ymm1,%%ymm3		\n\t"\
		"vmulpd	%%ymm6,%%ymm0,%%ymm0		/* c4 */\n\t"\
		"vmulpd	%%ymm6,%%ymm1,%%ymm1		\n\t"\
		"vmulpd	%%ymm7,%%ymm2,%%ymm2		\n\t"\
		"vmulpd	%%ymm7,%%ymm3,%%ymm3		\n\t"\
		"vmovaps	%%ymm4,%%ymm6		\n\t"\
		"vaddpd	%%ymm2,%%ymm1,%%ymm1		\n\t"\
		"vmovaps	%%ymm5,%%ymm7		\n\t"\
		"vmulpd	0x40(%%rsi),%%ymm4,%%ymm4	/* c5 */\n\t"\
		"vsubpd	%%ymm3,%%ymm0,%%ymm0		\n\t"\
		"vmulpd	0x40(%%rsi),%%ymm5,%%ymm5	\n\t"\
		"vmulpd	0x60(%%rsi),%%ymm6,%%ymm6	\n\t"\
		"vmovaps	%%ymm0,%%ymm2		\n\t"\
		"vmulpd	0x60(%%rsi),%%ymm7,%%ymm7	\n\t"\
		"vaddpd	%%ymm6,%%ymm5,%%ymm5		\n\t"\
		"vmovaps	%%ymm1,%%ymm3		\n\t"\
		"vsubpd	%%ymm7,%%ymm4,%%ymm4		\n\t"\
		"vaddpd	%%ymm4,%%ymm0,%%ymm0		\n\t"\
		"vaddpd	%%ymm5,%%ymm1,%%ymm1		\n\t"\
		"vsubpd	%%ymm4,%%ymm2,%%ymm2		\n\t"\
		"vsubpd	%%ymm5,%%ymm3,%%ymm3		\n\t"\
		"/* Do	the p2,3 combo: */	\n\t"\
		"addq	$0x80,%%rsi 	/* c6,7 */\n\t"\
		"vmovaps	    (%%rdx),%%ymm4	\n\t"\
		"vmovaps	0x20(%%rdx),%%ymm5	\n\t"\
		"vmovaps	    (%%rdx),%%ymm6	\n\t"\
		"vmovaps	0x20(%%rdx),%%ymm7	\n\t"\
		"vmulpd	0x40(%%rsi),%%ymm4,%%ymm4	/* c7 */\n\t"\
		"vmulpd	0x40(%%rsi),%%ymm5,%%ymm5	\n\t"\
		"vmulpd	0x60(%%rsi),%%ymm6,%%ymm6	\n\t"\
		"vmulpd	0x60(%%rsi),%%ymm7,%%ymm7	\n\t"\
		"vaddpd	%%ymm6,%%ymm5,%%ymm5		\n\t"\
		"vsubpd	%%ymm7,%%ymm4,%%ymm4		\n\t"\
		"vmovaps	%%ymm5,0x20(%%rax)	\n\t"\
		"vmovaps	%%ymm4,    (%%rax)	\n\t"\
		"vmovaps	    (%%rbx),%%ymm4	\n\t"\
		"vmovaps	0x20(%%rbx),%%ymm5	\n\t"\
		"vmovaps	    (%%rbx),%%ymm6	\n\t"\
		"vmovaps	0x20(%%rbx),%%ymm7	\n\t"\
		"vmulpd	    (%%rsi),%%ymm4,%%ymm4	/* c6 */\n\t"\
		"vmulpd	    (%%rsi),%%ymm5,%%ymm5	\n\t"\
		"vmulpd	0x20(%%rsi),%%ymm6,%%ymm6	\n\t"\
		"vmulpd	0x20(%%rsi),%%ymm7,%%ymm7	\n\t"\
		"vaddpd	%%ymm6,%%ymm5,%%ymm5		\n\t"\
		"vsubpd	%%ymm7,%%ymm4,%%ymm4		\n\t"\
		"vmovaps	%%ymm5,%%ymm7		\n\t"\
		"vmovaps	%%ymm4,%%ymm6		\n\t"\
		"vsubpd	    (%%rax),%%ymm4,%%ymm4	\n\t"\
		"vsubpd	0x20(%%rax),%%ymm5,%%ymm5	\n\t"\
		"vaddpd	    (%%rax),%%ymm6,%%ymm6	\n\t"\
		"vaddpd	0x20(%%rax),%%ymm7,%%ymm7	\n\t"\
		"/* Finish radix-4 bfly, store results: */\n\t"\
		"vsubpd	%%ymm6,%%ymm0,%%ymm0		\n\t"\
		"vsubpd	%%ymm5,%%ymm2,%%ymm2		\n\t"\
		"vsubpd	%%ymm7,%%ymm1,%%ymm1		\n\t"\
		"vsubpd	%%ymm4,%%ymm3,%%ymm3		\n\t"\
		"vmovaps	%%ymm0,    (%%rcx)	\n\t"\
		"vmovaps	%%ymm2,    (%%rbx)	\n\t"\
		"vmovaps	%%ymm1,0x20(%%rcx)	\n\t"\
		"vmovaps	%%ymm3,0x20(%%rdx)	\n\t"\
		"vaddpd	%%ymm6,%%ymm6,%%ymm6		\n\t"\
		"vaddpd	%%ymm5,%%ymm5,%%ymm5		\n\t"\
		"vaddpd	%%ymm7,%%ymm7,%%ymm7		\n\t"\
		"vaddpd	%%ymm4,%%ymm4,%%ymm4		\n\t"\
		"vaddpd	%%ymm0,%%ymm6,%%ymm6		\n\t"\
		"vaddpd	%%ymm2,%%ymm5,%%ymm5		\n\t"\
		"vaddpd	%%ymm1,%%ymm7,%%ymm7		\n\t"\
		"vaddpd	%%ymm3,%%ymm4,%%ymm4		\n\t"\
		"vmovaps	%%ymm6,    (%%rax)	\n\t"\
		"vmovaps	%%ymm5,    (%%rdx)	\n\t"\
		"vmovaps	%%ymm7,0x20(%%rax)	\n\t"\
		"vmovaps	%%ymm4,0x20(%%rbx)	\n\t"\
		"\n\t"\
	/*...Block 2: outputs into __in0 + [8,9,a,b]*istride: */\
		"addq	$%c[__i4],%%rax	\n\t"/* __in0 + 8*istride */\
		"addq	$%c[__i4],%%rcx	\n\t"/* __in0 + 9*istride */\
		"addq	$%c[__i4],%%rbx	\n\t"/* __in0 + a*istride */\
		"addq	$%c[__i4],%%rdx	\n\t"/* __in0 + b*istride */\
		"/* Do	the p0,1 combo: */	\n\t"\
		"addq	$0x80,%%rsi 	/* c8,9 */\n\t"\
		"vmovaps	    (%%rax),%%ymm0	\n\t"\
		"vmovaps	    (%%rcx),%%ymm4	\n\t"\
		"vmovaps	0x20(%%rax),%%ymm1	\n\t"\
		"vmovaps	0x20(%%rcx),%%ymm5	\n\t"\
		"vmovaps	    (%%rsi),%%ymm6	\n\t"\
		"vmovaps	0x20(%%rsi),%%ymm7	\n\t"\
		"vmovaps	%%ymm0,%%ymm2		\n\t"\
		"vmovaps	%%ymm1,%%ymm3		\n\t"\
		"vmulpd	%%ymm6,%%ymm0,%%ymm0		/* c8 */\n\t"\
		"vmulpd	%%ymm6,%%ymm1,%%ymm1		\n\t"\
		"vmulpd	%%ymm7,%%ymm2,%%ymm2		\n\t"\
		"vmulpd	%%ymm7,%%ymm3,%%ymm3		\n\t"\
		"vmovaps	%%ymm4,%%ymm6		\n\t"\
		"vaddpd	%%ymm2,%%ymm1,%%ymm1		\n\t"\
		"vmovaps	%%ymm5,%%ymm7		\n\t"\
		"vmulpd	0x40(%%rsi),%%ymm4,%%ymm4	/* c9 */\n\t"\
		"vsubpd	%%ymm3,%%ymm0,%%ymm0		\n\t"\
		"vmulpd	0x40(%%rsi),%%ymm5,%%ymm5	\n\t"\
		"vmulpd	0x60(%%rsi),%%ymm6,%%ymm6	\n\t"\
		"vmovaps	%%ymm0,%%ymm2		\n\t"\
		"vmulpd	0x60(%%rsi),%%ymm7,%%ymm7	\n\t"\
		"vaddpd	%%ymm6,%%ymm5,%%ymm5		\n\t"\
		"vmovaps	%%ymm1,%%ymm3		\n\t"\
		"vsubpd	%%ymm7,%%ymm4,%%ymm4		\n\t"\
		"vaddpd	%%ymm4,%%ymm0,%%ymm0		\n\t"\
		"vaddpd	%%ymm5,%%ymm1,%%ymm1		\n\t"\
		"vsubpd	%%ymm4,%%ymm2,%%ymm2		\n\t"\
		"vsubpd	%%ymm5,%%ymm3,%%ymm3		\n\t"\
		"/* Do	the p2,3 combo: */	\n\t"\
		"addq	$0x80,%%rsi 	/* ca,b */\n\t"\
		"vmovaps	    (%%rdx),%%ymm4	\n\t"\
		"vmovaps	0x20(%%rdx),%%ymm5	\n\t"\
		"vmovaps	    (%%rdx),%%ymm6	\n\t"\
		"vmovaps	0x20(%%rdx),%%ymm7	\n\t"\
		"vmulpd	0x40(%%rsi),%%ymm4,%%ymm4	/* cb */\n\t"\
		"vmulpd	0x40(%%rsi),%%ymm5,%%ymm5	\n\t"\
		"vmulpd	0x60(%%rsi),%%ymm6,%%ymm6	\n\t"\
		"vmulpd	0x60(%%rsi),%%ymm7,%%ymm7	\n\t"\
		"vaddpd	%%ymm6,%%ymm5,%%ymm5		\n\t"\
		"vsubpd	%%ymm7,%%ymm4,%%ymm4		\n\t"\
		"vmovaps	%%ymm5,0x20(%%rax)	\n\t"\
		"vmovaps	%%ymm4,    (%%rax)	\n\t"\
		"vmovaps	    (%%rbx),%%ymm4	\n\t"\
		"vmovaps	0x20(%%rbx),%%ymm5	\n\t"\
		"vmovaps	    (%%rbx),%%ymm6	\n\t"\
		"vmovaps	0x20(%%rbx),%%ymm7	\n\t"\
		"vmulpd	    (%%rsi),%%ymm4,%%ymm4	/* ca */\n\t"\
		"vmulpd	    (%%rsi),%%ymm5,%%ymm5	\n\t"\
		"vmulpd	0x20(%%rsi),%%ymm6,%%ymm6	\n\t"\
		"vmulpd	0x20(%%rsi),%%ymm7,%%ymm7	\n\t"\
		"vaddpd	%%ymm6,%%ymm5,%%ymm5		\n\t"\
		"vsubpd	%%ymm7,%%ymm4,%%ymm4		\n\t"\
		"vmovaps	%%ymm5,%%ymm7		\n\t"\
		"vmovaps	%%ymm4,%%ymm6		\n\t"\
		"vsubpd	    (%%rax),%%ymm4,%%ymm4	\n\t"\
		"vsubpd	0x20(%%rax),%%ymm5,%%ymm5	\n\t"\
		"vaddpd	    (%%rax),%%ymm6,%%ymm6	\n\t"\
		"vaddpd	0x20(%%rax),%%ymm7,%%ymm7	\n\t"\
		"/* Finish radix-4 bfly, store results: */\n\t"\
		"vsubpd	%%ymm6,%%ymm0,%%ymm0		\n\t"\
		"vsubpd	%%ymm5,%%ymm2,%%ymm2		\n\t"\
		"vsubpd	%%ymm7,%%ymm1,%%ymm1		\n\t"\
		"vsubpd	%%ymm4,%%ymm3,%%ymm3		\n\t"\
		"vmovaps	%%ymm0,    (%%rcx)	\n\t"\
		"vmovaps	%%ymm2,    (%%rbx)	\n\t"\
		"vmovaps	%%ymm1,0x20(%%rcx)	\n\t"\
		"vmovaps	%%ymm3,0x20(%%rdx)	\n\t"\
		"vaddpd	%%ymm6,%%ymm6,%%ymm6		\n\t"\
		"vaddpd	%%ymm5,%%ymm5,%%ymm5		\n\t"\
		"vaddpd	%%ymm7,%%ymm7,%%ymm7		\n\t"\
		"vaddpd	%%ymm4,%%ymm4,%%ymm4		\n\t"\
		"vaddpd	%%ymm0,%%ymm6,%%ymm6		\n\t"\
		"vaddpd	%%ymm2,%%ymm5,%%ymm5		\n\t"\
		"vaddpd	%%ymm1,%%ymm7,%%ymm7		\n\t"\
		"vaddpd	%%ymm3,%%ymm4,%%ymm4		\n\t"\
		"vmovaps	%%ymm6,    (%%rax)	\n\t"\
		"vmovaps	%%ymm5,    (%%rdx)	\n\t"\
		"vmovaps	%%ymm7,0x20(%%rax)	\n\t"\
		"vmovaps	%%ymm4,0x20(%%rbx)	\n\t"\
		"\n\t"\
	/*...Block 3: outputs into __in0 + [c,d,e,f]*istride: */\
		"addq	$%c[__i4],%%rax	\n\t"/* __in0 + c*istride */\
		"addq	$%c[__i4],%%rcx	\n\t"/* __in0 + d*istride */\
		"addq	$%c[__i4],%%rbx	\n\t"/* __in0 + e*istride */\
		"addq	$%c[__i4],%%rdx	\n\t"/* __in0 + f*istride */\
		"/* Do	the p0,1 combo: */	\n\t"\
		"addq	$0x80,%%rsi 	/* cc,d */\n\t"\
		"vmovaps	    (%%rax),%%ymm0	\n\t"\
		"vmovaps	    (%%rcx),%%ymm4	\n\t"\
		"vmovaps	0x20(%%rax),%%ymm1	\n\t"\
		"vmovaps	0x20(%%rcx),%%ymm5	\n\t"\
		"vmovaps	    (%%rsi),%%ymm6	\n\t"\
		"vmovaps	0x20(%%rsi),%%ymm7	\n\t"\
		"vmovaps	%%ymm0,%%ymm2		\n\t"\
		"vmovaps	%%ymm1,%%ymm3		\n\t"\
		"vmulpd	%%ymm6,%%ymm0,%%ymm0		/* cc */\n\t"\
		"vmulpd	%%ymm6,%%ymm1,%%ymm1		\n\t"\
		"vmulpd	%%ymm7,%%ymm2,%%ymm2		\n\t"\
		"vmulpd	%%ymm7,%%ymm3,%%ymm3		\n\t"\
		"vmovaps	%%ymm4,%%ymm6		\n\t"\
		"vaddpd	%%ymm2,%%ymm1,%%ymm1		\n\t"\
		"vmovaps	%%ymm5,%%ymm7		\n\t"\
		"vmulpd	0x40(%%rsi),%%ymm4,%%ymm4	/* cd */\n\t"\
		"vsubpd	%%ymm3,%%ymm0,%%ymm0		\n\t"\
		"vmulpd	0x40(%%rsi),%%ymm5,%%ymm5	\n\t"\
		"vmulpd	0x60(%%rsi),%%ymm6,%%ymm6	\n\t"\
		"vmovaps	%%ymm0,%%ymm2		\n\t"\
		"vmulpd	0x60(%%rsi),%%ymm7,%%ymm7	\n\t"\
		"vaddpd	%%ymm6,%%ymm5,%%ymm5		\n\t"\
		"vmovaps	%%ymm1,%%ymm3		\n\t"\
		"vsubpd	%%ymm7,%%ymm4,%%ymm4		\n\t"\
		"vaddpd	%%ymm4,%%ymm0,%%ymm0		\n\t"\
		"vaddpd	%%ymm5,%%ymm1,%%ymm1		\n\t"\
		"vsubpd	%%ymm4,%%ymm2,%%ymm2		\n\t"\
		"vsubpd	%%ymm5,%%ymm3,%%ymm3		\n\t"\
		"/* Do	the p2,3 combo: */	\n\t"\
		"addq	$0x80,%%rsi 	/* ce,f */\n\t"\
		"vmovaps	    (%%rdx),%%ymm4	\n\t"\
		"vmovaps	0x20(%%rdx),%%ymm5	\n\t"\
		"vmovaps	    (%%rdx),%%ymm6	\n\t"\
		"vmovaps	0x20(%%rdx),%%ymm7	\n\t"\
		"vmulpd	0x40(%%rsi),%%ymm4,%%ymm4	/* cf */\n\t"\
		"vmulpd	0x40(%%rsi),%%ymm5,%%ymm5	\n\t"\
		"vmulpd	0x60(%%rsi),%%ymm6,%%ymm6	\n\t"\
		"vmulpd	0x60(%%rsi),%%ymm7,%%ymm7	\n\t"\
		"vaddpd	%%ymm6,%%ymm5,%%ymm5		\n\t"\
		"vsubpd	%%ymm7,%%ymm4,%%ymm4		\n\t"\
		"vmovaps	%%ymm5,0x20(%%rax)	\n\t"\
		"vmovaps	%%ymm4,    (%%rax)	\n\t"\
		"vmovaps	    (%%rbx),%%ymm4	\n\t"\
		"vmovaps	0x20(%%rbx),%%ymm5	\n\t"\
		"vmovaps	    (%%rbx),%%ymm6	\n\t"\
		"vmovaps	0x20(%%rbx),%%ymm7	\n\t"\
		"vmulpd	    (%%rsi),%%ymm4,%%ymm4	/* ce */\n\t"\
		"vmulpd	    (%%rsi),%%ymm5,%%ymm5	\n\t"\
		"vmulpd	0x20(%%rsi),%%ymm6,%%ymm6	\n\t"\
		"vmulpd	0x20(%%rsi),%%ymm7,%%ymm7	\n\t"\
		"vaddpd	%%ymm6,%%ymm5,%%ymm5		\n\t"\
		"vsubpd	%%ymm7,%%ymm4,%%ymm4		\n\t"\
		"vmovaps	%%ymm5,%%ymm7		\n\t"\
		"vmovaps	%%ymm4,%%ymm6		\n\t"\
		"vsubpd	    (%%rax),%%ymm4,%%ymm4	\n\t"\
		"vsubpd	0x20(%%rax),%%ymm5,%%ymm5	\n\t"\
		"vaddpd	    (%%rax),%%ymm6,%%ymm6	\n\t"\
		"vaddpd	0x20(%%rax),%%ymm7,%%ymm7	\n\t"\
		"/* Finish radix-4 bfly, store results: */\n\t"\
		"vsubpd	%%ymm6,%%ymm0,%%ymm0		\n\t"\
		"vsubpd	%%ymm5,%%ymm2,%%ymm2		\n\t"\
		"vsubpd	%%ymm7,%%ymm1,%%ymm1		\n\t"\
		"vsubpd	%%ymm4,%%ymm3,%%ymm3		\n\t"\
		"vmovaps	%%ymm0,    (%%rcx)	\n\t"\
		"vmovaps	%%ymm2,    (%%rbx)	\n\t"\
		"vmovaps	%%ymm1,0x20(%%rcx)	\n\t"\
		"vmovaps	%%ymm3,0x20(%%rdx)	\n\t"\
		"vaddpd	%%ymm6,%%ymm6,%%ymm6		\n\t"\
		"vaddpd	%%ymm5,%%ymm5,%%ymm5		\n\t"\
		"vaddpd	%%ymm7,%%ymm7,%%ymm7		\n\t"\
		"vaddpd	%%ymm4,%%ymm4,%%ymm4		\n\t"\
		"vaddpd	%%ymm0,%%ymm6,%%ymm6		\n\t"\
		"vaddpd	%%ymm2,%%ymm5,%%ymm5		\n\t"\
		"vaddpd	%%ymm1,%%ymm7,%%ymm7		\n\t"\
		"vaddpd	%%ymm3,%%ymm4,%%ymm4		\n\t"\
		"vmovaps	%%ymm6,    (%%rax)	\n\t"\
		"vmovaps	%%ymm5,    (%%rdx)	\n\t"\
		"vmovaps	%%ymm7,0x20(%%rax)	\n\t"\
		"vmovaps	%%ymm4,0x20(%%rbx)	\n\t"\
	/*************************************************************************************/\
	/*  And now do four more radix-4 transforms, including the internal twiddle factors: */\
	/*************************************************************************************/\
	/* Block 0: Combine 0-output of each radix-4, i.e. inputs from __in0 + [0,4,8,c]*istride: */\
		"movq	%[__in0],%%rax		\n\t"\
		"leaq	%c[__i4](%%rax),%%rbx	\n\t"/* __in0 +   [4*istride] */\
		"leaq	%c[__i4](%%rbx),%%rcx	\n\t"/* __in0 + 2*[4*istride] */\
		"leaq	%c[__i4](%%rcx),%%rdx	\n\t"/* __in0 + 3*[4*istride] */\
		"vmovaps	    (%%rax),%%ymm0	\n\t"\
		"vmovaps	    (%%rcx),%%ymm4	\n\t"\
		"vmovaps	0x20(%%rax),%%ymm1	\n\t"\
		"vmovaps	0x20(%%rcx),%%ymm5	\n\t"\
		"vmovaps	    (%%rbx),%%ymm2	\n\t"\
		"vmovaps	    (%%rdx),%%ymm6	\n\t"\
		"vmovaps	0x20(%%rbx),%%ymm3	\n\t"\
		"vmovaps	0x20(%%rdx),%%ymm7	\n\t"\
		"vsubpd	%%ymm2,%%ymm0,%%ymm0		\n\t"\
		"vsubpd	%%ymm6,%%ymm4,%%ymm4		\n\t"\
		"vsubpd	%%ymm3,%%ymm1,%%ymm1		\n\t"\
		"vsubpd	%%ymm7,%%ymm5,%%ymm5		\n\t"\
		"vaddpd	%%ymm2,%%ymm2,%%ymm2		\n\t"\
		"vaddpd	%%ymm6,%%ymm6,%%ymm6		\n\t"\
		"vaddpd	%%ymm3,%%ymm3,%%ymm3		\n\t"\
		"vaddpd	%%ymm7,%%ymm7,%%ymm7		\n\t"\
		"vaddpd	%%ymm0,%%ymm2,%%ymm2		\n\t"\
		"vaddpd	%%ymm4,%%ymm6,%%ymm6		\n\t"\
		"vaddpd	%%ymm1,%%ymm3,%%ymm3		\n\t"\
		"vaddpd	%%ymm5,%%ymm7,%%ymm7		\n\t"\
		"vsubpd	%%ymm6,%%ymm2,%%ymm2		\n\t"\
		"vsubpd	%%ymm7,%%ymm3,%%ymm3		\n\t"\
		"vaddpd	%%ymm6,%%ymm6,%%ymm6		\n\t"\
		"vaddpd	%%ymm7,%%ymm7,%%ymm7		\n\t"\
	/* Load output base-address into r8 and offset-array pointer into r9: */\
	"movq	%[__out0],%%r8	\n\t	movq	%[__off],%%r9	\n\t"\
	/* Block 0: r0-3 */\
		"movslq		    (%%r9),%%r10	\n\t"/* off0-3: 32-bit offsets sign-extended to 64 bits */\
		"movslq		0x04(%%r9),%%r11	\n\t"\
		"movslq		0x08(%%r9),%%r12	\n\t"\
		"movslq		0x0c(%%r9),%%r13	\n\t"\
		"leaq	(%%r8,%%r10,8),%%r10	\n\t"/* out0 + off0-3 (offsets counted in doubles, hence *8) */\
		"leaq	(%%r8,%%r11,8),%%r11	\n\t"\
		"leaq	(%%r8,%%r12,8),%%r12	\n\t"\
		"leaq	(%%r8,%%r13,8),%%r13	\n\t"\
	"prefetcht1	0x100(%%r11)\n\t"\
		"vmovaps	%%ymm2,    (%%r11)	\n\t"\
		"vmovaps	%%ymm3,0x20(%%r11)	\n\t"\
		"vaddpd	%%ymm2,%%ymm6,%%ymm6	\n\t"\
		"vaddpd	%%ymm3,%%ymm7,%%ymm7	\n\t"\
		"vmovaps	%%ymm6,    (%%r10)	\n\t"\
		"vmovaps	%%ymm7,0x20(%%r10)	\n\t"\
		"vsubpd	%%ymm5,%%ymm0,%%ymm0	\n\t"\
		"vsubpd	%%ymm4,%%ymm1,%%ymm1	\n\t"\
		"vaddpd	%%ymm5,%%ymm5,%%ymm5	\n\t"\
		"vaddpd	%%ymm4,%%ymm4,%%ymm4	\n\t"\
		"vmovaps	%%ymm0,    (%%r12)	\n\t"\
		"vmovaps	%%ymm1,0x20(%%r13)	\n\t"\
		"vaddpd	%%ymm0,%%ymm5,%%ymm5	\n\t"\
		"vaddpd	%%ymm1,%%ymm4,%%ymm4	\n\t"\
		"vmovaps	%%ymm5,    (%%r13)	\n\t"\
		"vmovaps	%%ymm4,0x20(%%r12)	\n\t"\
	/* Block 2: Combine 2-output of each radix-4, i.e. inputs from __in0 + [4,5,6,7]*istride: */\
		"movq	%[__isrt2],%%rsi 	\n\t"\
		"vmovaps	(%%rsi),%%ymm3	/* isrt2 */\n\t"\
		"addq	$%c[__i1],%%rax	\n\t"/* __in0 + 1*istride */\
		"addq	$%c[__i1],%%rbx	\n\t"/* __in0 + 5*istride */\
		"addq	$%c[__i1],%%rcx	\n\t"/* __in0 + 9*istride */\
		"addq	$%c[__i1],%%rdx	\n\t"/* __in0 + d*istride */\
	"prefetcht1	0x100(%%r13)\n\t"\
		"vmovaps	    (%%rcx),%%ymm4	\n\t"\
		"vmovaps	0x20(%%rcx),%%ymm5	\n\t"\
		"vmovaps	    (%%rdx),%%ymm6	\n\t"\
		"vmovaps	0x20(%%rdx),%%ymm7	\n\t"\
		"vmulpd	%%ymm3,%%ymm4,%%ymm4		\n\t"\
		"vmovaps	    (%%rax),%%ymm0	\n\t"\
		"vmulpd	%%ymm3,%%ymm5,%%ymm5		\n\t"\
		"vmovaps	0x20(%%rax),%%ymm1	\n\t"\
		"vmulpd	%%ymm3,%%ymm6,%%ymm6		\n\t"\
		"vmovaps	    (%%rbx),%%ymm2	\n\t"\
		"vmulpd	%%ymm3,%%ymm7,%%ymm7		\n\t"\
		"vmovaps	0x20(%%rbx),%%ymm3	\n\t"/* ymm3 now reloaded with data; isrt2 no longer needed in-reg */\
		"vsubpd	%%ymm3,%%ymm0,%%ymm0		\n\t"\
		"vsubpd	%%ymm5,%%ymm4,%%ymm4		\n\t"\
		"vsubpd	%%ymm2,%%ymm1,%%ymm1		\n\t"\
		"vsubpd	%%ymm6,%%ymm7,%%ymm7		\n\t"\
		"vaddpd	%%ymm3,%%ymm3,%%ymm3		\n\t"\
		"vaddpd	%%ymm5,%%ymm5,%%ymm5		\n\t"\
		"vaddpd	%%ymm2,%%ymm2,%%ymm2		\n\t"\
		"vaddpd	%%ymm6,%%ymm6,%%ymm6		\n\t"\
		"vaddpd	%%ymm0,%%ymm3,%%ymm3		\n\t"\
		"vaddpd	%%ymm4,%%ymm5,%%ymm5		\n\t"\
		"vaddpd	%%ymm1,%%ymm2,%%ymm2		\n\t"\
		"vaddpd	%%ymm7,%%ymm6,%%ymm6		\n\t"\
		"vsubpd	%%ymm6,%%ymm4,%%ymm4		\n\t"\
		"vsubpd	%%ymm7,%%ymm5,%%ymm5		\n\t"\
		"vaddpd	%%ymm6,%%ymm6,%%ymm6		\n\t"\
		"vaddpd	%%ymm7,%%ymm7,%%ymm7		\n\t"\
		"vaddpd	%%ymm4,%%ymm6,%%ymm6		\n\t"\
		"vaddpd	%%ymm5,%%ymm7,%%ymm7		\n\t"\
		"vsubpd	%%ymm4,%%ymm0,%%ymm0		\n\t"\
		"vsubpd	%%ymm7,%%ymm3,%%ymm3		\n\t"\
		"vsubpd	%%ymm5,%%ymm2,%%ymm2		\n\t"\
		"vsubpd	%%ymm6,%%ymm1,%%ymm1		\n\t"\
		"vaddpd	%%ymm4,%%ymm4,%%ymm4		\n\t"\
		"vaddpd	%%ymm7,%%ymm7,%%ymm7		\n\t"\
		"vaddpd	%%ymm5,%%ymm5,%%ymm5		\n\t"\
		"vaddpd	%%ymm6,%%ymm6,%%ymm6		\n\t"\
		"movslq		0x10(%%r9),%%r10	\n\t"/* off4-7 */\
		"movslq		0x14(%%r9),%%r11	\n\t"\
		"movslq		0x18(%%r9),%%r12	\n\t"\
		"movslq		0x1c(%%r9),%%r13	\n\t"\
		"leaq	(%%r8,%%r10,8),%%r10	\n\t"/* out0 + off4-7 */\
		"leaq	(%%r8,%%r11,8),%%r11	\n\t"\
		"leaq	(%%r8,%%r12,8),%%r12	\n\t"\
		"leaq	(%%r8,%%r13,8),%%r13	\n\t"\
	"prefetcht1	0x100(%%r11)\n\t"\
		"vmovaps	%%ymm0,    (%%r11)	\n\t"\
		"vmovaps	%%ymm3,    (%%r12)	\n\t"\
		"vmovaps	%%ymm2,0x20(%%r11)	\n\t"\
		"vmovaps	%%ymm1,0x20(%%r13)	\n\t"\
		"vaddpd	%%ymm0,%%ymm4,%%ymm4	\n\t"\
		"vaddpd	%%ymm3,%%ymm7,%%ymm7	\n\t"\
		"vaddpd	%%ymm2,%%ymm5,%%ymm5	\n\t"\
		"vaddpd	%%ymm1,%%ymm6,%%ymm6	\n\t"\
		"vmovaps	%%ymm4,    (%%r10)	\n\t"\
		"vmovaps	%%ymm7,    (%%r13)	\n\t"\
		"vmovaps	%%ymm5,0x20(%%r10)	\n\t"\
		"vmovaps	%%ymm6,0x20(%%r12)	\n\t"\
	/* Block 1: Combine 1-output of each radix-4, i.e. inputs from __in0 + [8,9,a,b]*istride: */\
		"addq	$%c[__i1],%%rax	\n\t"/* __in0 + 2*istride */\
		"addq	$%c[__i1],%%rbx	\n\t"/* __in0 + 6*istride */\
		"addq	$%c[__i1],%%rcx	\n\t"/* __in0 + a*istride */\
		"addq	$%c[__i1],%%rdx	\n\t"/* __in0 + e*istride */\
	"prefetcht1	0x100(%%r13)\n\t"\
		"vmovaps	    (%%rcx),%%ymm4	\n\t"\
		"vmovaps	0x20(%%rcx),%%ymm5	\n\t"\
		"vmovaps	0x20(%%rsi),%%ymm3	/* cc0, using isrt2 as base-ptr */\n\t"\
		"vmovaps	0x40(%%rsi),%%ymm2	/* ss0, using isrt2 as base-ptr */\n\t"\
		"vmovaps	%%ymm4,%%ymm6		\n\t"\
		"vmovaps	%%ymm5,%%ymm7		\n\t"\
		"vmulpd	%%ymm3,%%ymm4,%%ymm4		\n\t"\
		"vmulpd	%%ymm3,%%ymm5,%%ymm5		\n\t"\
		"vmulpd	%%ymm2,%%ymm6,%%ymm6		\n\t"\
		"vmovaps	    (%%rdx),%%ymm0	\n\t"\
		"vmulpd	%%ymm2,%%ymm7,%%ymm7		\n\t"\
		"vmovaps	0x20(%%rdx),%%ymm1	\n\t"\
		"vaddpd	%%ymm6,%%ymm5,%%ymm5		\n\t"\
		"vmovaps	%%ymm0,%%ymm6		\n\t"\
		"vsubpd	%%ymm7,%%ymm4,%%ymm4		\n\t"\
		"vmovaps	%%ymm1,%%ymm7		\n\t"\
		"vmulpd	%%ymm2,%%ymm6,%%ymm6		\n\t"\
		"vmulpd	%%ymm2,%%ymm7,%%ymm7		\n\t"\
		"vmulpd	%%ymm3,%%ymm0,%%ymm0		\n\t"\
		"vmulpd	%%ymm3,%%ymm1,%%ymm1		\n\t"\
		"vaddpd	%%ymm0,%%ymm7,%%ymm7		\n\t"\
		"vsubpd	%%ymm1,%%ymm6,%%ymm6		\n\t"\
		"vmovaps	%%ymm4,%%ymm2		\n\t"\
		"vmovaps	%%ymm5,%%ymm3		\n\t"\
		"vsubpd	%%ymm6,%%ymm4,%%ymm4		\n\t"\
		"vsubpd	%%ymm7,%%ymm5,%%ymm5		\n\t"\
		"vaddpd	%%ymm2,%%ymm6,%%ymm6		\n\t"\
		"vaddpd	%%ymm3,%%ymm7,%%ymm7		\n\t"\
		"vmovaps	    (%%rbx),%%ymm2	\n\t"\
		"vmovaps	0x20(%%rbx),%%ymm3	\n\t"\
		"vmovaps	    (%%rsi),%%ymm1	/* isrt2 */\n\t"\
		"vmovaps	%%ymm2,%%ymm0		\n\t"\
		"vsubpd	%%ymm3,%%ymm2,%%ymm2		\n\t"\
		"vaddpd	%%ymm0,%%ymm3,%%ymm3		\n\t"\
		"vmulpd	%%ymm1,%%ymm2,%%ymm2		\n\t"\
		"vmulpd	%%ymm1,%%ymm3,%%ymm3		\n\t"\
		"vmovaps	    (%%rax),%%ymm0	\n\t"\
		"vmovaps	0x20(%%rax),%%ymm1	\n\t"\
		"vsubpd	%%ymm2,%%ymm0,%%ymm0		\n\t"\
		"vsubpd	%%ymm3,%%ymm1,%%ymm1		\n\t"\
		"vaddpd	%%ymm2,%%ymm2,%%ymm2		\n\t"\
		"vaddpd	%%ymm3,%%ymm3,%%ymm3		\n\t"\
		"vaddpd	%%ymm0,%%ymm2,%%ymm2		\n\t"\
		"vaddpd	%%ymm1,%%ymm3,%%ymm3		\n\t"\
		"vsubpd	%%ymm6,%%ymm2,%%ymm2		\n\t"\
		"vsubpd	%%ymm5,%%ymm0,%%ymm0		\n\t"\
		"vsubpd	%%ymm7,%%ymm3,%%ymm3		\n\t"\
		"vsubpd	%%ymm4,%%ymm1,%%ymm1		\n\t"\
		"vaddpd	%%ymm6,%%ymm6,%%ymm6		\n\t"\
		"vaddpd	%%ymm5,%%ymm5,%%ymm5		\n\t"\
		"vaddpd	%%ymm7,%%ymm7,%%ymm7		\n\t"\
		"vaddpd	%%ymm4,%%ymm4,%%ymm4		\n\t"\
		"movslq		0x20(%%r9),%%r10	\n\t"/* off8-b */\
		"movslq		0x24(%%r9),%%r11	\n\t"\
		"movslq		0x28(%%r9),%%r12	\n\t"\
		"movslq		0x2c(%%r9),%%r13	\n\t"\
		"leaq	(%%r8,%%r10,8),%%r10	\n\t"/* out0 + off8-b */\
		"leaq	(%%r8,%%r11,8),%%r11	\n\t"\
		"leaq	(%%r8,%%r12,8),%%r12	\n\t"\
		"leaq	(%%r8,%%r13,8),%%r13	\n\t"\
	"prefetcht1	0x100(%%r11)\n\t"\
		"vmovaps	%%ymm2,    (%%r11)	\n\t"\
		"vmovaps	%%ymm0,    (%%r12)	\n\t"\
		"vmovaps	%%ymm3,0x20(%%r11)	\n\t"\
		"vmovaps	%%ymm1,0x20(%%r13)	\n\t"\
		"vaddpd	%%ymm2,%%ymm6,%%ymm6	\n\t"\
		"vaddpd	%%ymm0,%%ymm5,%%ymm5	\n\t"\
		"vaddpd	%%ymm3,%%ymm7,%%ymm7	\n\t"\
		"vaddpd	%%ymm1,%%ymm4,%%ymm4	\n\t"\
		"vmovaps	%%ymm6,    (%%r10)	\n\t"\
		"vmovaps	%%ymm5,    (%%r13)	\n\t"\
		"vmovaps	%%ymm7,0x20(%%r10)	\n\t"\
		"vmovaps	%%ymm4,0x20(%%r12)	\n\t"\
	/* Block 3: Combine 3-output of each radix-4, i.e. inputs from __in0 + [c,d,e,f]*istride: */\
		"addq	$%c[__i1],%%rax	\n\t"/* __in0 + 3*istride */\
		"addq	$%c[__i1],%%rbx	\n\t"/* __in0 + 7*istride */\
		"addq	$%c[__i1],%%rcx	\n\t"/* __in0 + b*istride */\
		"addq	$%c[__i1],%%rdx	\n\t"/* __in0 + f*istride */\
	"prefetcht1	0x100(%%r13)\n\t"\
		"vmovaps	    (%%rcx),%%ymm4	\n\t"\
		"vmovaps	0x20(%%rcx),%%ymm5	\n\t"\
		"vmovaps	0x20(%%rsi),%%ymm2	/* cc0, using isrt2 as base-ptr */\n\t"\
		"vmovaps	0x40(%%rsi),%%ymm3	/* ss0, using isrt2 as base-ptr */\n\t"\
		"vmovaps	%%ymm4,%%ymm6		\n\t"\
		"vmovaps	%%ymm5,%%ymm7		\n\t"\
		"vmulpd	%%ymm3,%%ymm4,%%ymm4		\n\t"\
		"vmulpd	%%ymm3,%%ymm5,%%ymm5		\n\t"\
		"vmulpd	%%ymm2,%%ymm6,%%ymm6		\n\t"\
		"vmovaps	    (%%rdx),%%ymm0	\n\t"\
		"vmulpd	%%ymm2,%%ymm7,%%ymm7		\n\t"\
		"vmovaps	0x20(%%rdx),%%ymm1	\n\t"\
		"vaddpd	%%ymm6,%%ymm5,%%ymm5		\n\t"\
		"vmovaps	%%ymm0,%%ymm6		\n\t"\
		"vsubpd	%%ymm7,%%ymm4,%%ymm4		\n\t"\
		"vmovaps	%%ymm1,%%ymm7		\n\t"\
		"vmulpd	%%ymm2,%%ymm6,%%ymm6		\n\t"\
		"vmulpd	%%ymm2,%%ymm7,%%ymm7		\n\t"\
		"vmulpd	%%ymm3,%%ymm0,%%ymm0		\n\t"\
		"vmulpd	%%ymm3,%%ymm1,%%ymm1		\n\t"\
		"vaddpd	%%ymm0,%%ymm7,%%ymm7		\n\t"\
		"vsubpd	%%ymm1,%%ymm6,%%ymm6		\n\t"\
		"vmovaps	%%ymm4,%%ymm2		\n\t"\
		"vmovaps	%%ymm5,%%ymm3		\n\t"\
		"vsubpd	%%ymm6,%%ymm4,%%ymm4		\n\t"\
		"vsubpd	%%ymm7,%%ymm5,%%ymm5		\n\t"\
		"vaddpd	%%ymm2,%%ymm6,%%ymm6		\n\t"\
		"vaddpd	%%ymm3,%%ymm7,%%ymm7		\n\t"\
		"vmovaps	    (%%rbx),%%ymm2	\n\t"\
		"vmovaps	0x20(%%rbx),%%ymm3	\n\t"\
		"vmovaps	    (%%rsi),%%ymm1		/* isrt2 */\n\t"\
		"vmovaps	%%ymm2,%%ymm0		\n\t"\
		"vaddpd	%%ymm3,%%ymm2,%%ymm2		\n\t"/* Note sign flip vs Block 1: this is the conjugate-pair rotation */\
		"vsubpd	%%ymm0,%%ymm3,%%ymm3		\n\t"\
		"vmulpd	%%ymm1,%%ymm2,%%ymm2		\n\t"\
		"vmulpd	%%ymm1,%%ymm3,%%ymm3		\n\t"\
		"vmovaps	    (%%rax),%%ymm0	\n\t"\
		"vmovaps	0x20(%%rax),%%ymm1	\n\t"\
		"vsubpd	%%ymm2,%%ymm0,%%ymm0		\n\t"\
		"vsubpd	%%ymm3,%%ymm1,%%ymm1		\n\t"\
		"vaddpd	%%ymm2,%%ymm2,%%ymm2		\n\t"\
		"vaddpd	%%ymm3,%%ymm3,%%ymm3		\n\t"\
		"vaddpd	%%ymm0,%%ymm2,%%ymm2		\n\t"\
		"vaddpd	%%ymm1,%%ymm3,%%ymm3		\n\t"\
		"vsubpd	%%ymm4,%%ymm0,%%ymm0		\n\t"\
		"vsubpd	%%ymm7,%%ymm2,%%ymm2		\n\t"\
		"vsubpd	%%ymm5,%%ymm1,%%ymm1		\n\t"\
		"vsubpd	%%ymm6,%%ymm3,%%ymm3		\n\t"\
		"vaddpd	%%ymm4,%%ymm4,%%ymm4		\n\t"\
		"vaddpd	%%ymm7,%%ymm7,%%ymm7		\n\t"\
		"vaddpd	%%ymm5,%%ymm5,%%ymm5		\n\t"\
		"vaddpd	%%ymm6,%%ymm6,%%ymm6		\n\t"\
		"movslq		0x30(%%r9),%%r10	\n\t"/* offc-f */\
		"movslq		0x34(%%r9),%%r11	\n\t"\
		"movslq		0x38(%%r9),%%r12	\n\t"\
		"movslq		0x3c(%%r9),%%r13	\n\t"\
		"leaq	(%%r8,%%r10,8),%%r10	\n\t"/* out0 + offc-f */\
		"leaq	(%%r8,%%r11,8),%%r11	\n\t"\
		"leaq	(%%r8,%%r12,8),%%r12	\n\t"\
		"leaq	(%%r8,%%r13,8),%%r13	\n\t"\
	"prefetcht1	0x100(%%r11)\n\t"\
		"vmovaps	%%ymm0,    (%%r11)	\n\t"\
		"vmovaps	%%ymm2,    (%%r12)	\n\t"\
		"vmovaps	%%ymm1,0x20(%%r11)	\n\t"\
		"vmovaps	%%ymm3,0x20(%%r13)	\n\t"\
		"vaddpd	%%ymm0,%%ymm4,%%ymm4	\n\t"\
		"vaddpd	%%ymm2,%%ymm7,%%ymm7	\n\t"\
		"vaddpd	%%ymm1,%%ymm5,%%ymm5	\n\t"\
		"vaddpd	%%ymm3,%%ymm6,%%ymm6	\n\t"\
		"vmovaps	%%ymm4,    (%%r10)	\n\t"\
		"vmovaps	%%ymm7,    (%%r13)	\n\t"\
		"vmovaps	%%ymm5,0x20(%%r10)	\n\t"\
		"vmovaps	%%ymm6,0x20(%%r12)	\n\t"\
	"prefetcht1	0x100(%%r13)\n\t"\
		:					/* outputs: none */\
		: [__in0] "m" (Xin0)	/* All inputs from memory addresses here */\
		 ,[__i1] "e" (Xi1)\
		 ,[__i4] "e" (Xi4)\
		 ,[__out0] "m" (Xout0)\
		 ,[__off] "m" (Xoff)\
		 ,[__isrt2] "m" (Xisrt2)\
		 ,[__c1] "m" (Xc1)\
		: "cc","memory","rax","rbx","rcx","rdx","rsi","r8","r9","r10","r11","r12","r13","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7"		/* Clobbered registers */\
	);\
	}

	/* SSE2_RADIX_03_DFT: AVX (ymm) radix-3 complex DFT on one triplet of vector operands.
	Inputs __i0-2 and outputs __o0-2 each point to an 0x40-byte complex datum (Re at +0x00,
	Im at +0x20); __cc1 points to a trig-constant pair at +0x00 and +0x20 -- presumably
	[c3m1, s3] per the standard radix-3 butterfly; TODO confirm against the table-init code.
	Outputs may legally coincide with inputs: all three input operands are fully loaded into
	ymm registers before rax/rbx/rcx are repurposed as output pointers (rdx/__cc1 is read
	after, but is distinct from the data arrays).
	All addresses must be 32-byte aligned (vmovaps). Clobbers rax-rdx, ymm0-7, flags. */
	#define SSE2_RADIX_03_DFT(Xi0,Xi1,Xi2, Xcc1, Xo0,Xo1,Xo2)\
	{\
	__asm__ volatile (\
		"movq	%[__i0],%%rax				\n\t"\
		"movq	%[__i1],%%rbx				\n\t"\
		"movq	%[__i2],%%rcx				\n\t"\
		"movq	%[__cc1],%%rdx				\n\t"\
		"vmovaps	    (%%rbx),%%ymm2		\n\t"\
		"vmovaps	0x20(%%rbx),%%ymm3		\n\t"\
		"vmovaps	    (%%rax),%%ymm0		\n\t"\
		"vmovaps	0x20(%%rax),%%ymm1		\n\t"\
		"vmovaps	    (%%rcx),%%ymm6		\n\t"\
		"vmovaps	0x20(%%rcx),%%ymm7		\n\t"\
		"vmovaps	%%ymm2,%%ymm4			\n\t"\
		"vmovaps	%%ymm3,%%ymm5			\n\t"\
		"movq	%[__o0],%%rax				\n\t"/* Safe to overwrite i-pointers: all input data now in-register */\
		"movq	%[__o1],%%rbx				\n\t"\
		"movq	%[__o2],%%rcx				\n\t"\
		"vaddpd	%%ymm6,%%ymm2,%%ymm2		\n\t"\
		"vaddpd	%%ymm7,%%ymm3,%%ymm3		\n\t"\
		"vsubpd	%%ymm6,%%ymm4,%%ymm4		\n\t"\
		"vsubpd	%%ymm7,%%ymm5,%%ymm5		\n\t"\
		"vaddpd	%%ymm2,%%ymm0,%%ymm0		\n\t"\
		"vaddpd	%%ymm3,%%ymm1,%%ymm1		\n\t"\
		"vmovaps	    (%%rdx),%%ymm6		\n\t"\
		"vmovaps	0x20(%%rdx),%%ymm7		\n\t"\
		"vmovaps	%%ymm0,    (%%rax)		\n\t"/* DC output = i0 + (i1+i2) */\
		"vmovaps	%%ymm1,0x20(%%rax)		\n\t"\
		"vmulpd	%%ymm6,%%ymm2,%%ymm2		\n\t"\
		"vmulpd	%%ymm6,%%ymm3,%%ymm3		\n\t"\
		"vmulpd	%%ymm7,%%ymm4,%%ymm4		\n\t"\
		"vmulpd	%%ymm7,%%ymm5,%%ymm5		\n\t"\
		"vaddpd	%%ymm0,%%ymm2,%%ymm2		\n\t"\
		"vaddpd	%%ymm1,%%ymm3,%%ymm3		\n\t"\
		"vmovaps	%%ymm2,%%ymm0			\n\t"\
		"vmovaps	%%ymm3,%%ymm1			\n\t"\
		"vsubpd	%%ymm5,%%ymm2,%%ymm2		\n\t"\
		"vaddpd	%%ymm4,%%ymm3,%%ymm3		\n\t"\
		"vaddpd	%%ymm5,%%ymm0,%%ymm0		\n\t"\
		"vsubpd	%%ymm4,%%ymm1,%%ymm1		\n\t"\
		"vmovaps	%%ymm2,    (%%rbx)		\n\t"\
		"vmovaps	%%ymm3,0x20(%%rbx)		\n\t"\
		"vmovaps	%%ymm0,    (%%rcx)		\n\t"\
		"vmovaps	%%ymm1,0x20(%%rcx)		\n\t"\
		:					/* outputs: none */\
		: [__i0] "m" (Xi0)	/* All inputs from memory addresses here */\
		 ,[__i1] "m" (Xi1)\
		 ,[__i2] "m" (Xi2)\
		 ,[__cc1] "m" (Xcc1)\
		 ,[__o0] "m" (Xo0)\
		 ,[__o1] "m" (Xo1)\
		 ,[__o2] "m" (Xo2)\
		: "cc","memory","rax","rbx","rcx","rdx","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7"		/* Clobbered registers */\
	);\
	}

	/* SSE2_RADIX4_DIF_0TWIDDLE_STRIDE: AVX (ymm) no-twiddle radix-4 DIF butterfly.
	Reads 4 complex vector operands from the local store at __tmp + [0,1,2,3]*__stride
	(__stride is a compile-time byte offset via the "e" constraint, though here it is used
	as a run-time value in rsi), writes the 4 results to the independent main-array
	addresses __add0-3. Complex layout: Re at +0x00, Im at +0x20 of each operand; all
	addresses must be 32-byte aligned (vmovaps). Clobbers rax,rbx,rcx,rdx,rsi, ymm0-7, flags. */
	#define SSE2_RADIX4_DIF_0TWIDDLE_STRIDE(Xadd0, Xadd1, Xadd2, Xadd3, Xtmp, Xstride)\
	{\
	__asm__ volatile (\
		"movq	%[__tmp]   ,%%rax	\n\t"\
		"movq	%[__stride],%%rsi	\n\t"\
		"movq	%%rax,%%rbx			\n\t"\
		"addq	%%rsi,%%rbx			/* add_in1  */\n\t"\
		"shlq	$1,%%rsi			/* stride*2 */\n\t"\
		"vmovaps	    (%%rax),%%ymm0	\n\t"\
		"vmovaps	    (%%rbx),%%ymm2	\n\t"\
		"vmovaps	0x20(%%rax),%%ymm1	\n\t"\
		"vmovaps	0x20(%%rbx),%%ymm3	\n\t"\
		"vmovaps	    (%%rax),%%ymm4	\n\t"\
		"vmovaps	    (%%rbx),%%ymm6	\n\t"\
		"vmovaps	0x20(%%rax),%%ymm5	\n\t"\
		"vmovaps	0x20(%%rbx),%%ymm7	\n\t"\
		"addq	%%rsi,%%rax			/* add_in2  */\n\t"\
		"addq	%%rsi,%%rbx			/* add_in3  */\n\t"\
		"vaddpd	    (%%rax),%%ymm0,%%ymm0	\n\t"/* ymm0-3 = sum terms, ymm4-7 = difference terms */\
		"vaddpd	    (%%rbx),%%ymm2,%%ymm2	\n\t"\
		"vaddpd	0x20(%%rax),%%ymm1,%%ymm1	\n\t"\
		"vaddpd	0x20(%%rbx),%%ymm3,%%ymm3	\n\t"\
		"vsubpd	    (%%rax),%%ymm4,%%ymm4	\n\t"\
		"vsubpd	    (%%rbx),%%ymm6,%%ymm6	\n\t"\
		"vsubpd	0x20(%%rax),%%ymm5,%%ymm5	\n\t"\
		"vsubpd	0x20(%%rbx),%%ymm7,%%ymm7	\n\t"\
		"/* Finish radix-4 butterfly and store results into main-array slots: */\n\t"\
		"movq	%[__add0],%%rax		\n\t"\
		"movq	%[__add1],%%rbx		\n\t"\
		"movq	%[__add2],%%rcx		\n\t"\
		"movq	%[__add3],%%rdx		\n\t"\
		"vsubpd	%%ymm2,%%ymm0,%%ymm0		\n\t"\
		"vsubpd	%%ymm7,%%ymm4,%%ymm4		\n\t"\
		"vsubpd	%%ymm3,%%ymm1,%%ymm1		\n\t"\
		"vsubpd	%%ymm6,%%ymm5,%%ymm5		\n\t"\
		"vmovaps	%%ymm0,    (%%rbx)	\n\t"\
		"vmovaps	%%ymm4,    (%%rcx)	\n\t"\
		"vmovaps	%%ymm1,0x20(%%rbx)	\n\t"\
		"vmovaps	%%ymm5,0x20(%%rdx)	\n\t"\
		"vaddpd	%%ymm2,%%ymm2,%%ymm2		\n\t"\
		"vaddpd	%%ymm7,%%ymm7,%%ymm7		\n\t"\
		"vaddpd	%%ymm3,%%ymm3,%%ymm3		\n\t"\
		"vaddpd	%%ymm6,%%ymm6,%%ymm6		\n\t"\
		"vaddpd	%%ymm0,%%ymm2,%%ymm2		\n\t"/* x + 2y recovers the (x+y) partner of each (x-y) above */\
		"vaddpd	%%ymm4,%%ymm7,%%ymm7		\n\t"\
		"vaddpd	%%ymm1,%%ymm3,%%ymm3		\n\t"\
		"vaddpd	%%ymm5,%%ymm6,%%ymm6		\n\t"\
		"vmovaps	%%ymm2,    (%%rax)	\n\t"\
		"vmovaps	%%ymm7,    (%%rdx)	\n\t"\
		"vmovaps	%%ymm3,0x20(%%rax)	\n\t"\
		"vmovaps	%%ymm6,0x20(%%rcx)	\n\t"\
		:					/* outputs: none */\
		: [__add0] "m" (Xadd0)	/* All inputs from memory addresses here */\
		 ,[__add1] "m" (Xadd1)\
		 ,[__add2] "m" (Xadd2)\
		 ,[__add3] "m" (Xadd3)\
		 ,[__tmp] "m" (Xtmp)\
		 ,[__stride] "e" (Xstride)\
		: "cc","memory","rax","rbx","rcx","rdx","rsi","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7"		/* Clobbered registers */\
	);\
	}

	/* DIF radix-4 subconvolution, sans twiddles, inputs in __i0-3, outputs in __o0-3, possibly coincident with inputs: */
	/* Four explicit input pointers __i0-3 and four explicit output pointers __o0-3, each to an
	0x40-byte complex datum (Re at +0x00, Im at +0x20); all must be 32-byte aligned (vmovaps).
	In-place use is safe: every input word is consumed (loaded or folded via memory-operand
	add/sub) before rax-rdx are repurposed as output pointers.
	Clobbers rax-rdx, ymm0-7, flags. */
	#define SSE2_RADIX4_DIF_0TWIDDLE_STRIDE_E(Xi0,Xi1,Xi2,Xi3, Xo0,Xo1,Xo2,Xo3)\
	{\
	__asm__ volatile (\
		"movq	%[__i0],%%rax		\n\t"\
		"movq	%[__i1],%%rbx		\n\t"\
		"movq	%[__i2],%%rcx		\n\t"\
		"movq	%[__i3],%%rdx		\n\t"\
		"vmovaps	    (%%rax),%%ymm0		\n\t"\
		"vmovaps	    (%%rbx),%%ymm4		\n\t"\
		"vmovaps	0x20(%%rax),%%ymm1		\n\t"\
		"vmovaps	0x20(%%rbx),%%ymm5		\n\t"\
		"vmovaps	%%ymm0,%%ymm2			\n\t"\
		"vmovaps	%%ymm4,%%ymm6			\n\t"\
		"vmovaps	%%ymm1,%%ymm3			\n\t"\
		"vmovaps	%%ymm5,%%ymm7			\n\t"\
		"vaddpd	    (%%rcx),%%ymm0,%%ymm0	\n\t"/* ymm0,1/4,5 = sums; ymm2,3/6,7 = differences */\
		"vaddpd	    (%%rdx),%%ymm4,%%ymm4	\n\t"\
		"vaddpd	0x20(%%rcx),%%ymm1,%%ymm1	\n\t"\
		"vaddpd	0x20(%%rdx),%%ymm5,%%ymm5	\n\t"\
		"vsubpd	    (%%rcx),%%ymm2,%%ymm2	\n\t"\
		"vsubpd	    (%%rdx),%%ymm6,%%ymm6	\n\t"\
		"vsubpd	0x20(%%rcx),%%ymm3,%%ymm3	\n\t"\
		"vsubpd	0x20(%%rdx),%%ymm7,%%ymm7	\n\t"\
		"/* Finish radix-4 butterfly and store results into main-array slots: */\n\t"\
		"movq	%[__o0],%%rax					\n\t"/* Inputs fully consumed above, so o-ptrs may alias i-ptrs */\
		"movq	%[__o1],%%rbx					\n\t"\
		"movq	%[__o2],%%rcx					\n\t"\
		"movq	%[__o3],%%rdx					\n\t"\
		"vsubpd	%%ymm4,%%ymm0,%%ymm0		\n\t"\
		"vsubpd	%%ymm7,%%ymm2,%%ymm2		\n\t"\
		"vsubpd	%%ymm5,%%ymm1,%%ymm1		\n\t"\
		"vsubpd	%%ymm6,%%ymm3,%%ymm3		\n\t"\
		"vmovaps	%%ymm0,    (%%rbx)		\n\t"\
		"vmovaps	%%ymm2,    (%%rcx)		\n\t"\
		"vmovaps	%%ymm1,0x20(%%rbx)		\n\t"\
		"vmovaps	%%ymm3,0x20(%%rdx)		\n\t"\
		"vaddpd	%%ymm4,%%ymm4,%%ymm4		\n\t"\
		"vaddpd	%%ymm7,%%ymm7,%%ymm7		\n\t"\
		"vaddpd	%%ymm5,%%ymm5,%%ymm5		\n\t"\
		"vaddpd	%%ymm6,%%ymm6,%%ymm6		\n\t"\
		"vaddpd	%%ymm0,%%ymm4,%%ymm4		\n\t"/* x + 2y recovers the (x+y) partner of each (x-y) above */\
		"vaddpd	%%ymm2,%%ymm7,%%ymm7		\n\t"\
		"vaddpd	%%ymm1,%%ymm5,%%ymm5		\n\t"\
		"vaddpd	%%ymm3,%%ymm6,%%ymm6		\n\t"\
		"vmovaps	%%ymm4,    (%%rax)		\n\t"\
		"vmovaps	%%ymm7,    (%%rdx)		\n\t"\
		"vmovaps	%%ymm5,0x20(%%rax)		\n\t"\
		"vmovaps	%%ymm6,0x20(%%rcx)		\n\t"\
		:					/* outputs: none */\
		: [__i0] "m" (Xi0)	/* All inputs from memory addresses here */\
		 ,[__i1] "m" (Xi1)\
		 ,[__i2] "m" (Xi2)\
		 ,[__i3] "m" (Xi3)\
		 ,[__o0] "m" (Xo0)\
		 ,[__o1] "m" (Xo1)\
		 ,[__o2] "m" (Xo2)\
		 ,[__o3] "m" (Xo3)\
		: "cc","memory","rax","rbx","rcx","rdx","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7"		/* Clobbered registers */\
	);\
	}

	/* DIT radix-4 subconvolution, sans twiddles: gathers inputs from 4 arbitrary addresses
	__add0-3, writes results to 4 equally-spaced slots of a local store: __tmp + 0,1,2,3*__stride.
	AVX version (ymm = 4 doubles); Re at byte offset 0, Im at 0x20 of each complex SIMD datum.
	__stride is a compile-time constant (GCC "e" constraint = 32-bit-signed immediate).
	All data must be 32-byte aligned (vmovaps). Clobbers rax-rdx, ymm0-7.
	NOTE(review): "rsi" appears in the clobber list but is never written by this template -
	presumably retained for consistency with sibling macros; harmless over-declaration. */
	#define SSE2_RADIX4_DIT_0TWIDDLE_STRIDE(Xadd0, Xadd1, Xadd2, Xadd3, Xtmp, Xstride)\
	{\
	__asm__ volatile (\
		"movq	%[__add0],%%rax		\n\t"/* rax-rdx = input pointers add0-3 */\
		"movq	%[__add1],%%rbx		\n\t"\
		"movq	%[__add2],%%rcx		\n\t"\
		"movq	%[__add3],%%rdx		\n\t"\
		"vmovaps	    (%%rax),%%ymm0	\n\t"/* ymm0,1 = add0.re,im ; ymm4,5 = add2.re,im */\
		"vmovaps	    (%%rcx),%%ymm4	\n\t"\
		"vmovaps	0x20(%%rax),%%ymm1	\n\t"\
		"vmovaps	0x20(%%rcx),%%ymm5	\n\t"\
		"vmovaps	%%ymm0,%%ymm2			\n\t"/* copies feed the difference terms below */\
		"vmovaps	%%ymm4,%%ymm6			\n\t"\
		"vmovaps	%%ymm1,%%ymm3			\n\t"\
		"vmovaps	%%ymm5,%%ymm7			\n\t"\
		"movq	%[__tmp]   ,%%rax	\n\t"/* rax/rcx now freed for output addressing */\
		"movq	%[__stride],%%rcx	\n\t"\
		"vaddpd	    (%%rbx),%%ymm0,%%ymm0	\n\t"/* ymm0,1 = add0+add1 ; ymm4,5 = add2+add3 */\
		"vaddpd	    (%%rdx),%%ymm4,%%ymm4	\n\t"\
		"vaddpd	0x20(%%rbx),%%ymm1,%%ymm1	\n\t"\
		"vaddpd	0x20(%%rdx),%%ymm5,%%ymm5	\n\t"\
		"vsubpd	    (%%rbx),%%ymm2,%%ymm2	\n\t"/* ymm2,3 = add0-add1 ; ymm6,7 = add2-add3 */\
		"vsubpd	    (%%rdx),%%ymm6,%%ymm6	\n\t"\
		"vsubpd	0x20(%%rbx),%%ymm3,%%ymm3	\n\t"\
		"vsubpd	0x20(%%rdx),%%ymm7,%%ymm7	\n\t"\
		"movq	%%rax,%%rbx			\n\t"/* rbx = tmp + stride */\
		"addq	%%rcx,%%rbx			\n\t"\
		"movq	%%rbx,%%rdx			\n\t"\
		"addq	%%rcx,%%rcx			\n\t"/* rcx = 2*stride (temporarily) */\
		"addq	%%rcx,%%rdx			\n\t"/* rdx = tmp + 3*stride */\
		"addq	%%rax,%%rcx			\n\t"/* rcx = tmp + 2*stride */\
		"/* Finish radix-4 butterfly and store results into temp-array slots: */\n\t"\
		"vsubpd	%%ymm4,%%ymm0,%%ymm0			\n\t"/* difference halves of the final combine */\
		"vsubpd	%%ymm7,%%ymm2,%%ymm2			\n\t"\
		"vsubpd	%%ymm5,%%ymm1,%%ymm1			\n\t"\
		"vsubpd	%%ymm6,%%ymm3,%%ymm3			\n\t"\
		"vmovaps	%%ymm0,    (%%rcx)	\n\t"/* crossed Re/Im stores fold the +/-i twiddle into slots 1/3 */\
		"vmovaps	%%ymm2,    (%%rdx)	\n\t"\
		"vmovaps	%%ymm1,0x20(%%rcx)	\n\t"\
		"vmovaps	%%ymm3,0x20(%%rbx)	\n\t"\
		"vaddpd	%%ymm4,%%ymm4,%%ymm4			\n\t"/* 2*x + diff = sum halves, avoiding reloads */\
		"vaddpd	%%ymm7,%%ymm7,%%ymm7			\n\t"\
		"vaddpd	%%ymm5,%%ymm5,%%ymm5			\n\t"\
		"vaddpd	%%ymm6,%%ymm6,%%ymm6			\n\t"\
		"vaddpd	%%ymm0,%%ymm4,%%ymm4			\n\t"\
		"vaddpd	%%ymm2,%%ymm7,%%ymm7			\n\t"\
		"vaddpd	%%ymm1,%%ymm5,%%ymm5			\n\t"\
		"vaddpd	%%ymm3,%%ymm6,%%ymm6			\n\t"\
		"vmovaps	%%ymm4,    (%%rax)	\n\t"\
		"vmovaps	%%ymm7,    (%%rbx)	\n\t"\
		"vmovaps	%%ymm5,0x20(%%rax)	\n\t"\
		"vmovaps	%%ymm6,0x20(%%rdx)	\n\t"\
		:					/* outputs: none */\
		: [__add0] "m" (Xadd0)	/* All inputs from memory addresses here */\
		 ,[__add1] "m" (Xadd1)\
		 ,[__add2] "m" (Xadd2)\
		 ,[__add3] "m" (Xadd3)\
		 ,[__tmp] "m" (Xtmp)\
		 ,[__stride] "e" (Xstride)\
		: "cc","memory","rax","rbx","rcx","rdx","rsi","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7"		/* Clobbered registers */\
	);\
	}

	/* DIT radix-4 subconvolution, sans twiddles, inputs in __i0-3, outputs in __o0-3, possibly coincident with inputs: */
	/* AVX version (ymm = 4 doubles); Re at byte offset 0, Im at 0x20 of each complex SIMD datum.
	Mirror of the DIF _E macro above, but with the DIT pairing (i0 with i1, i2 with i3) and the
	opposite rotation sense in the crossed Re/Im stores for outputs 1/3.
	All data must be 32-byte aligned (vmovaps). Clobbers rax-rdx, ymm0-7.
	Inputs are fully read before the first output store, so in-place use is safe. */
	#define SSE2_RADIX4_DIT_0TWIDDLE_STRIDE_E(Xi0,Xi1,Xi2,Xi3, Xo0,Xo1,Xo2,Xo3)\
	{\
	__asm__ volatile (\
		"movq	%[__i0],%%rax		\n\t"/* rax-rdx = input pointers i0-i3 */\
		"movq	%[__i1],%%rbx		\n\t"\
		"movq	%[__i2],%%rcx		\n\t"\
		"movq	%[__i3],%%rdx		\n\t"\
		"vmovaps	    (%%rax),%%ymm0		\n\t"/* ymm0,1 = i0.re,im ; ymm4,5 = i2.re,im */\
		"vmovaps	    (%%rcx),%%ymm4		\n\t"\
		"vmovaps	0x20(%%rax),%%ymm1		\n\t"\
		"vmovaps	0x20(%%rcx),%%ymm5		\n\t"\
		"vmovaps	%%ymm0,%%ymm2			\n\t"/* copies feed the difference terms below */\
		"vmovaps	%%ymm4,%%ymm6			\n\t"\
		"vmovaps	%%ymm1,%%ymm3			\n\t"\
		"vmovaps	%%ymm5,%%ymm7			\n\t"\
		"vaddpd	    (%%rbx),%%ymm0,%%ymm0	\n\t"/* ymm0,1 = i0+i1 ; ymm4,5 = i2+i3 */\
		"vaddpd	    (%%rdx),%%ymm4,%%ymm4	\n\t"\
		"vaddpd	0x20(%%rbx),%%ymm1,%%ymm1	\n\t"\
		"vaddpd	0x20(%%rdx),%%ymm5,%%ymm5	\n\t"\
		"vsubpd	    (%%rbx),%%ymm2,%%ymm2	\n\t"/* ymm2,3 = i0-i1 ; ymm6,7 = i2-i3 */\
		"vsubpd	    (%%rdx),%%ymm6,%%ymm6	\n\t"\
		"vsubpd	0x20(%%rbx),%%ymm3,%%ymm3	\n\t"\
		"vsubpd	0x20(%%rdx),%%ymm7,%%ymm7	\n\t"\
		"/* Finish radix-4 butterfly and store results into output-array slots: */\n\t"\
		"movq	%[__o0],%%rax					\n\t"/* inputs fully consumed; reuse rax-rdx for o0-o3 */\
		"movq	%[__o1],%%rbx					\n\t"\
		"movq	%[__o2],%%rcx					\n\t"\
		"movq	%[__o3],%%rdx					\n\t"\
		"vsubpd	%%ymm4,%%ymm0,%%ymm0		\n\t"/* difference halves of the final combine */\
		"vsubpd	%%ymm7,%%ymm2,%%ymm2		\n\t"\
		"vsubpd	%%ymm5,%%ymm1,%%ymm1		\n\t"\
		"vsubpd	%%ymm6,%%ymm3,%%ymm3		\n\t"\
		"vmovaps	%%ymm0,    (%%rcx)		\n\t"/* crossed Re/Im stores fold the +/-i twiddle into o1/o3 */\
		"vmovaps	%%ymm2,    (%%rdx)		\n\t"\
		"vmovaps	%%ymm1,0x20(%%rcx)		\n\t"\
		"vmovaps	%%ymm3,0x20(%%rbx)		\n\t"\
		"vaddpd	%%ymm4,%%ymm4,%%ymm4		\n\t"/* 2*x + diff = sum halves, avoiding reloads */\
		"vaddpd	%%ymm7,%%ymm7,%%ymm7		\n\t"\
		"vaddpd	%%ymm5,%%ymm5,%%ymm5		\n\t"\
		"vaddpd	%%ymm6,%%ymm6,%%ymm6		\n\t"\
		"vaddpd	%%ymm0,%%ymm4,%%ymm4		\n\t"\
		"vaddpd	%%ymm2,%%ymm7,%%ymm7		\n\t"\
		"vaddpd	%%ymm1,%%ymm5,%%ymm5		\n\t"\
		"vaddpd	%%ymm3,%%ymm6,%%ymm6		\n\t"\
		"vmovaps	%%ymm4,    (%%rax)		\n\t"\
		"vmovaps	%%ymm7,    (%%rbx)		\n\t"\
		"vmovaps	%%ymm5,0x20(%%rax)		\n\t"\
		"vmovaps	%%ymm6,0x20(%%rdx)		\n\t"\
		:					/* outputs: none */\
		: [__i0] "m" (Xi0)	/* All inputs from memory addresses here */\
		 ,[__i1] "m" (Xi1)\
		 ,[__i2] "m" (Xi2)\
		 ,[__i3] "m" (Xi3)\
		 ,[__o0] "m" (Xo0)\
		 ,[__o1] "m" (Xo1)\
		 ,[__o2] "m" (Xo2)\
		 ,[__o3] "m" (Xo3)\
		: "cc","memory","rax","rbx","rcx","rdx","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7"		/* Clobbered registers */\
	);\
	}

	/* Twiddleless radix-5 complex DFT, AVX version (ymm = 4 doubles; Re at byte offset 0, Im at 0x20).
	Inputs __i0-4, outputs __o0-4. __cc1 points to a table of five 4-double vector constants at byte
	offsets 0x0,0x20,0x40,0x60,0x80 - presumably the usual radix-5 trig combinations
	(cc1,cc2,s2,ss1,ss2-style); layout must match the caller's sse2 constant block - TODO confirm.
	Uses the standard real-rotation scheme: symmetric sums/differences of the (i1,i4) and (i2,i3)
	pairs, cosine-part combinations for the real axis, sine-part combinations for the imaginary axis.
	NOTE(review): intermediates are stored back through (%%rsi) = __i0, i.e. the __i0 slots are
	overwritten as scratch before the final output stores - callers must not rely on __i0's input
	value surviving this macro (verify against call sites).
	All data must be 32-byte aligned (vmovaps). Clobbers rax-rdx, rdi, rsi, ymm0-7. */
	#define SSE2_RADIX_05_DFT_0TWIDDLE(Xi0,Xi1,Xi2,Xi3,Xi4, Xcc1, Xo0,Xo1,Xo2,Xo3,Xo4)\
	{\
	__asm__ volatile (\
		"movq	%[__i0],%%rsi		\n\t"/* rsi = i0 (DC term; also reused as scratch below) */\
		"movq	%[__i1],%%rax		\n\t"\
		"movq	%[__i2],%%rbx		\n\t"\
		"movq	%[__i3],%%rcx		\n\t"\
		"movq	%[__i4],%%rdx		\n\t"\
		"movq	%[__o0],%%rdi		\n\t"\
		"vmovaps	    (%%rax),%%ymm0	\n\t"/* ymm0,1 = i1 ; ymm2,3 = i2 ; ymm4,5 = i3 ; ymm6,7 = i4 */\
		"vmovaps	0x20(%%rax),%%ymm1	\n\t"\
		"vmovaps	    (%%rbx),%%ymm2	\n\t"\
		"vmovaps	0x20(%%rbx),%%ymm3	\n\t"\
		"vmovaps	    (%%rcx),%%ymm4	\n\t"\
		"vmovaps	0x20(%%rcx),%%ymm5	\n\t"\
		"vmovaps	    (%%rdx),%%ymm6	\n\t"\
		"vmovaps	0x20(%%rdx),%%ymm7	\n\t"\
		"vsubpd	%%ymm6,%%ymm0,%%ymm0	\n\t"/* ymm0,1 = i1-i4 ; ymm6,7 = i1+i4 (via 2*x + diff) */\
		"vsubpd	%%ymm7,%%ymm1,%%ymm1	\n\t"\
		"vaddpd	%%ymm6,%%ymm6,%%ymm6	\n\t"\
		"vaddpd	%%ymm7,%%ymm7,%%ymm7	\n\t"\
		"vaddpd	%%ymm0,%%ymm6,%%ymm6	\n\t"\
		"vaddpd	%%ymm1,%%ymm7,%%ymm7	\n\t"\
		"vsubpd	%%ymm4,%%ymm2,%%ymm2	\n\t"/* ymm2,3 = i2-i3 ; ymm4,5 = i2+i3 */\
		"vsubpd	%%ymm5,%%ymm3,%%ymm3	\n\t"\
		"vaddpd	%%ymm4,%%ymm4,%%ymm4	\n\t"\
		"vaddpd	%%ymm5,%%ymm5,%%ymm5	\n\t"\
		"vaddpd	%%ymm2,%%ymm4,%%ymm4	\n\t"\
		"vaddpd	%%ymm3,%%ymm5,%%ymm5	\n\t"\
	"movq	%[__cc1],%%rax		\n\t"/* i1 fully consumed; rax -> trig-constant table */\
		"vsubpd	%%ymm4,%%ymm6,%%ymm6	\n\t"/* ymm6,7 = (i1+i4)-(i2+i3) ; ymm4,5 = (i1+i4)+(i2+i3) */\
		"vsubpd	%%ymm5,%%ymm7,%%ymm7	\n\t"\
		"vaddpd	%%ymm4,%%ymm4,%%ymm4	\n\t"\
		"vaddpd	%%ymm5,%%ymm5,%%ymm5	\n\t"\
		"vaddpd	%%ymm6,%%ymm4,%%ymm4	\n\t"\
		"vaddpd	%%ymm7,%%ymm5,%%ymm5	\n\t"\
		"vaddpd	    (%%rsi),%%ymm4,%%ymm4	\n\t"/* ymm4,5 = full sum = output 0 (DC) */\
		"vaddpd	0x20(%%rsi),%%ymm5,%%ymm5	\n\t"\
	"vmovaps	%%ymm4,    (%%rdi)	\n\t"/* store o0 */\
	"vmovaps	%%ymm5,0x20(%%rdi)	\n\t"\
		"vmulpd	0x20(%%rax),%%ymm6,%%ymm6	\n\t"/* cosine-part combinations, consts at cc1+0x0,0x20 */\
		"vmulpd	0x20(%%rax),%%ymm7,%%ymm7	\n\t"\
		"vsubpd	    (%%rsi),%%ymm4,%%ymm4	\n\t"\
		"vsubpd	0x20(%%rsi),%%ymm5,%%ymm5	\n\t"\
		"vmulpd	    (%%rax),%%ymm4,%%ymm4	\n\t"\
		"vmulpd	    (%%rax),%%ymm5,%%ymm5	\n\t"\
		"vaddpd	    (%%rdi),%%ymm4,%%ymm4	\n\t"\
		"vaddpd	0x20(%%rdi),%%ymm5,%%ymm5	\n\t"\
		"vsubpd	%%ymm6,%%ymm4,%%ymm4	\n\t"\
		"vsubpd	%%ymm7,%%ymm5,%%ymm5	\n\t"\
		"vaddpd	%%ymm6,%%ymm6,%%ymm6	\n\t"\
		"vaddpd	%%ymm7,%%ymm7,%%ymm7	\n\t"\
		"vaddpd	%%ymm4,%%ymm6,%%ymm6	\n\t"\
		"vaddpd	%%ymm5,%%ymm7,%%ymm7	\n\t"\
		"vmovaps	%%ymm4,    (%%rsi)	\n\t"/* NOTE: __i0 slots overwritten as scratch here */\
		"vmovaps	%%ymm5,0x20(%%rsi)	\n\t"\
		"vmovaps	%%ymm0,%%ymm4		\n\t"\
		"vmovaps	%%ymm1,%%ymm5		\n\t"\
		"vsubpd	%%ymm2,%%ymm0,%%ymm0	\n\t"/* sine-part combinations, consts at cc1+0x40,0x60,0x80 */\
		"vsubpd	%%ymm3,%%ymm1,%%ymm1	\n\t"\
		"vmulpd	0x40(%%rax),%%ymm0,%%ymm0	\n\t"\
		"vmulpd	0x40(%%rax),%%ymm1,%%ymm1	\n\t"\
		"vmulpd	0x60(%%rax),%%ymm2,%%ymm2	\n\t"\
		"vmulpd	0x60(%%rax),%%ymm3,%%ymm3	\n\t"\
		"vmulpd	0x80(%%rax),%%ymm4,%%ymm4	\n\t"\
		"vmulpd	0x80(%%rax),%%ymm5,%%ymm5	\n\t"\
		"vaddpd	%%ymm0,%%ymm2,%%ymm2	\n\t"\
		"vaddpd	%%ymm1,%%ymm3,%%ymm3	\n\t"\
		"vsubpd	%%ymm4,%%ymm0,%%ymm0	\n\t"\
		"vsubpd	%%ymm5,%%ymm1,%%ymm1	\n\t"\
		"vmovaps	    (%%rsi),%%ymm4	\n\t"/* reload scratch saved above */\
		"vmovaps	0x20(%%rsi),%%ymm5	\n\t"\
	"movq	%[__o1],%%rax		\n\t"/* cc1 pointer consumed; reuse rax/rdx for o1/o4 */\
	"movq	%[__o4],%%rdx		\n\t"\
		"vsubpd	%%ymm3,%%ymm6,%%ymm6	\n\t"/* conjugate-pair outputs o1/o4: crossed Re/Im stores */\
		"vsubpd	%%ymm2,%%ymm7,%%ymm7	\n\t"\
		"vaddpd	%%ymm3,%%ymm3,%%ymm3	\n\t"\
		"vaddpd	%%ymm2,%%ymm2,%%ymm2	\n\t"\
		"vmovaps	%%ymm6,    (%%rax)	\n\t"\
		"vmovaps	%%ymm7,0x20(%%rdx)	\n\t"\
		"vaddpd	%%ymm6,%%ymm3,%%ymm3	\n\t"\
		"vaddpd	%%ymm7,%%ymm2,%%ymm2	\n\t"\
		"vmovaps	%%ymm3,    (%%rdx)	\n\t"\
		"vmovaps	%%ymm2,0x20(%%rax)	\n\t"\
	"movq	%[__o2],%%rbx		\n\t"/* conjugate-pair outputs o2/o3 */\
	"movq	%[__o3],%%rcx		\n\t"\
		"vsubpd	%%ymm1,%%ymm4,%%ymm4	\n\t"\
		"vsubpd	%%ymm0,%%ymm5,%%ymm5	\n\t"\
		"vaddpd	%%ymm1,%%ymm1,%%ymm1	\n\t"\
		"vaddpd	%%ymm0,%%ymm0,%%ymm0	\n\t"\
		"vmovaps	%%ymm4,    (%%rbx)	\n\t"\
		"vmovaps	%%ymm5,0x20(%%rcx)	\n\t"\
		"vaddpd	%%ymm4,%%ymm1,%%ymm1	\n\t"\
		"vaddpd	%%ymm5,%%ymm0,%%ymm0	\n\t"\
		"vmovaps	%%ymm1,    (%%rcx)	\n\t"\
		"vmovaps	%%ymm0,0x20(%%rbx)	\n\t"\
		:					/* outputs: none */\
		: [__i0] "m" (Xi0)	/* All inputs from memory addresses here */\
		 ,[__i1] "m" (Xi1)\
		 ,[__i2] "m" (Xi2)\
		 ,[__i3] "m" (Xi3)\
		 ,[__i4] "m" (Xi4)\
		 ,[__cc1] "m" (Xcc1)\
		 ,[__o0] "m" (Xo0)\
		 ,[__o1] "m" (Xo1)\
		 ,[__o2] "m" (Xo2)\
		 ,[__o3] "m" (Xo3)\
		 ,[__o4] "m" (Xo4)\
		: "cc","memory","rax","rbx","rcx","rdx","rdi","rsi","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7"		/* Clobbered registers */\
	);\
	}

#elif defined(USE_SSE2)

/*** Prefetch even-index input (i-)addresses in the DIT macro below, odd-index output (o-)addresses in SSE2_RADIX16_DIF_TWIDDLE_OOP ***/

	// Based on the SSE2_RADIX16_DIT_NOTWIDDLE macro in radix16_ditN_cy_dif1_gcc64.h, but with completely
	// specifiable 16-input addressing required for usage as the power-of-2 component of a twiddleless
	// radix = [odd*2^n] DFT routine.
	// We use just a single output base-pointer plus literal ostrides which are [1,2,3,4]-multiples of
	// __01; this allows us to cut GP-register usage, which is absolutely a must for the 32-bit version
	// of the macro, and is a benefit to the 64-bit versions which code-fold to yield 2 side-by-side
	// streams of independently executable instructions, one for data in xmm0-7, the other using xmm8-15.
	/* Dec 2020: Needed to cut #args for Apple M1/Clang builds on Arm64 - do similar on x86 to avoid
	multiple versions of the macro having different arglists. Replace 16 I-addresses with I-base-address
	in0 and pointer to array of 16 int offset-indices: */
	/* SSE2 (128-bit xmm, 2 doubles/register) radix-16 DIT DFT, no twiddles.
	Inputs: base pointer __in0 plus int32[16] offset array __off (offsets scaled *8 bytes via lea).
	Outputs: base pointer __out0 plus compile-time byte-offset constants __o1..__o4 = 1..4 * ostride
	("e" constraints = 32-bit-signed immediates). First pass: four radix-4 DIT subtransforms into
	__out0 + {0,4,8,12}*ostride; second pass: four radix-4 DFTs with internal twiddles across the
	4*ostride-separated data, blocks processed in order 0,2,1,3. __isrt2 points to the 1/sqrt(2)
	vector constant, with the cc0/ss0 twiddle pair assumed at __isrt2+0x10/+0x20 - layout per the
	caller's sse2 constant block. NOTE(review): the [__two] operand is declared but unreferenced in
	this template - presumably kept so all build variants share one arglist (see header comment).
	All data must be 16-byte aligned (movaps). Clobbers rax-rdx, rdi, rsi, r8, r9, xmm0-7. */
	#define SSE2_RADIX16_DIT_0TWIDDLE(Xin0,Xoff, Xisrt2,Xtwo, Xout0,Xo1,Xo2,Xo3,Xo4)\
	{\
	__asm__ volatile (\
	"movq	%[in0],%%r8	\n\t	movq	%[off],%%r9	\n\t"/* Load output base-address into r8 and int32[16] offset-array pointer into r9 */\
		"movslq		    (%%r9),%%rax	\n\t"/* off[0-3] */\
		"movslq		0x04(%%r9),%%rbx	\n\t"\
		"movslq		0x08(%%r9),%%rcx	\n\t"\
		"movslq		0x0c(%%r9),%%rdx	\n\t"\
		"leaq	(%%r8,%%rax,8),%%rax	\n\t"/* in0 + off[0-3] */\
		"leaq	(%%r8,%%rbx,8),%%rbx	\n\t"\
		"leaq	(%%r8,%%rcx,8),%%rcx	\n\t"\
		"leaq	(%%r8,%%rdx,8),%%rdx	\n\t"\
	"prefetcht1	0x100(%%rax)\n\t"\
	/* SSE2_RADIX4_DIT_0TWIDDLE_B(r0 ): */\
		"movaps	    (%%rax),%%xmm2	\n\t"\
		"movaps	    (%%rcx),%%xmm6	\n\t"\
		"movaps	0x10(%%rax),%%xmm3	\n\t"\
		"movaps	0x10(%%rcx),%%xmm7	\n\t"\
		"movaps	    (%%rbx),%%xmm0	\n\t"\
		"movaps	    (%%rdx),%%xmm4	\n\t"\
		"movaps	0x10(%%rbx),%%xmm1	\n\t"\
		"movaps	0x10(%%rdx),%%xmm5	\n\t"\
		"subpd	%%xmm0,%%xmm2		\n\t"\
		"subpd	%%xmm4,%%xmm6		\n\t"\
		"subpd	%%xmm1,%%xmm3		\n\t"\
		"subpd	%%xmm5,%%xmm7		\n\t"\
		"addpd	%%xmm0,%%xmm0		\n\t"\
		"addpd	%%xmm4,%%xmm4		\n\t"\
		"addpd	%%xmm1,%%xmm1		\n\t"\
		"addpd	%%xmm5,%%xmm5		\n\t"\
		"addpd	%%xmm2,%%xmm0		\n\t"\
		"addpd	%%xmm6,%%xmm4		\n\t"\
		"addpd	%%xmm3,%%xmm1		\n\t"\
		"addpd	%%xmm7,%%xmm5		\n\t"\
		"movq	%[__out0],%%rsi		\n\t"\
		"subpd	%%xmm4,%%xmm0		\n\t"\
		"subpd	%%xmm7,%%xmm2		\n\t"\
		"subpd	%%xmm5,%%xmm1		\n\t"\
		"subpd	%%xmm6,%%xmm3		\n\t"\
	"prefetcht1	0x100(%%rcx)\n\t"\
		"leaq	0x10(%%rsi),%%rdi	\n\t"/* Need separate address Im parts of outputs due to literal-offsets below */\
		"movaps	%%xmm0,%c[__o2](%%rsi)	\n\t"\
		"movaps	%%xmm2,%c[__o3](%%rsi)	\n\t"\
		"movaps	%%xmm1,%c[__o2](%%rdi)	\n\t"\
		"movaps	%%xmm3,%c[__o1](%%rdi)	\n\t"\
		"addpd	%%xmm4,%%xmm4		\n\t"\
		"addpd	%%xmm7,%%xmm7		\n\t"\
		"addpd	%%xmm5,%%xmm5		\n\t"\
		"addpd	%%xmm6,%%xmm6		\n\t"\
		"addpd	%%xmm0,%%xmm4		\n\t"\
		"addpd	%%xmm2,%%xmm7		\n\t"\
		"addpd	%%xmm1,%%xmm5		\n\t"\
		"addpd	%%xmm3,%%xmm6		\n\t"\
		"movaps	%%xmm4,        (%%rsi)	\n\t"\
		"movaps	%%xmm7,%c[__o1](%%rsi)	\n\t"\
		"movaps	%%xmm5,        (%%rdi)	\n\t"\
		"movaps	%%xmm6,%c[__o3](%%rdi)	\n\t"\
	/* SSE2_RADIX4_DIT_0TWIDDLE_B(r8 ): */\
		"movslq		0x10(%%r9),%%rax	\n\t"/* off[4-7] */\
		"movslq		0x14(%%r9),%%rbx	\n\t"\
		"movslq		0x18(%%r9),%%rcx	\n\t"\
		"movslq		0x1c(%%r9),%%rdx	\n\t"\
		"leaq	(%%r8,%%rax,8),%%rax	\n\t"/* in0 + off[4-7] */\
		"leaq	(%%r8,%%rbx,8),%%rbx	\n\t"\
		"leaq	(%%r8,%%rcx,8),%%rcx	\n\t"\
		"leaq	(%%r8,%%rdx,8),%%rdx	\n\t"\
		"addq	$%c[__o4],%%rsi		\n\t"/* __out0 + 4*ostride */\
	"prefetcht1	0x100(%%rax)\n\t"\
		"movaps	    (%%rax),%%xmm2	\n\t"\
		"movaps	    (%%rcx),%%xmm6	\n\t"\
		"movaps	0x10(%%rax),%%xmm3	\n\t"\
		"movaps	0x10(%%rcx),%%xmm7	\n\t"\
		"movaps	    (%%rbx),%%xmm0	\n\t"\
		"movaps	    (%%rdx),%%xmm4	\n\t"\
		"movaps	0x10(%%rbx),%%xmm1	\n\t"\
		"movaps	0x10(%%rdx),%%xmm5	\n\t"\
		"subpd	%%xmm0,%%xmm2		\n\t"\
		"subpd	%%xmm4,%%xmm6		\n\t"\
		"subpd	%%xmm1,%%xmm3		\n\t"\
		"subpd	%%xmm5,%%xmm7		\n\t"\
		"addpd	%%xmm0,%%xmm0		\n\t"\
		"addpd	%%xmm4,%%xmm4		\n\t"\
		"addpd	%%xmm1,%%xmm1		\n\t"\
		"addpd	%%xmm5,%%xmm5		\n\t"\
		"addpd	%%xmm2,%%xmm0		\n\t"\
		"addpd	%%xmm6,%%xmm4		\n\t"\
		"addpd	%%xmm3,%%xmm1		\n\t"\
		"addpd	%%xmm7,%%xmm5		\n\t"\
		"subpd	%%xmm4,%%xmm0		\n\t"\
		"subpd	%%xmm7,%%xmm2		\n\t"\
		"subpd	%%xmm5,%%xmm1		\n\t"\
		"subpd	%%xmm6,%%xmm3		\n\t"\
	"prefetcht1	0x100(%%rcx)\n\t"\
		"leaq	0x10(%%rsi),%%rdi	\n\t"\
		"movaps	%%xmm0,%c[__o2](%%rsi)	\n\t"\
		"movaps	%%xmm2,%c[__o3](%%rsi)	\n\t"\
		"movaps	%%xmm1,%c[__o2](%%rdi)	\n\t"\
		"movaps	%%xmm3,%c[__o1](%%rdi)	\n\t"\
		"addpd	%%xmm4,%%xmm4		\n\t"\
		"addpd	%%xmm7,%%xmm7		\n\t"\
		"addpd	%%xmm5,%%xmm5		\n\t"\
		"addpd	%%xmm6,%%xmm6		\n\t"\
		"addpd	%%xmm0,%%xmm4		\n\t"\
		"addpd	%%xmm2,%%xmm7		\n\t"\
		"addpd	%%xmm1,%%xmm5		\n\t"\
		"addpd	%%xmm3,%%xmm6		\n\t"\
		"movaps	%%xmm4,        (%%rsi)	\n\t"\
		"movaps	%%xmm7,%c[__o1](%%rsi)	\n\t"\
		"movaps	%%xmm5,        (%%rdi)	\n\t"\
		"movaps	%%xmm6,%c[__o3](%%rdi)	\n\t"\
	/* SSE2_RADIX4_DIT_0TWIDDLE_B(r16): */\
		"movslq		0x20(%%r9),%%rax	\n\t"/* off[8-b] */\
		"movslq		0x24(%%r9),%%rbx	\n\t"\
		"movslq		0x28(%%r9),%%rcx	\n\t"\
		"movslq		0x2c(%%r9),%%rdx	\n\t"\
		"leaq	(%%r8,%%rax,8),%%rax	\n\t"/* in0 + off[8-b] */\
		"leaq	(%%r8,%%rbx,8),%%rbx	\n\t"\
		"leaq	(%%r8,%%rcx,8),%%rcx	\n\t"\
		"leaq	(%%r8,%%rdx,8),%%rdx	\n\t"\
		"addq	$%c[__o4],%%rsi		\n\t"/* __out0 + 8*ostride */\
	"prefetcht1	0x100(%%rax)\n\t"\
		"movaps	    (%%rax),%%xmm2	\n\t"\
		"movaps	    (%%rcx),%%xmm6	\n\t"\
		"movaps	0x10(%%rax),%%xmm3	\n\t"\
		"movaps	0x10(%%rcx),%%xmm7	\n\t"\
		"movaps	    (%%rbx),%%xmm0	\n\t"\
		"movaps	    (%%rdx),%%xmm4	\n\t"\
		"movaps	0x10(%%rbx),%%xmm1	\n\t"\
		"movaps	0x10(%%rdx),%%xmm5	\n\t"\
		"subpd	%%xmm0,%%xmm2		\n\t"\
		"subpd	%%xmm4,%%xmm6		\n\t"\
		"subpd	%%xmm1,%%xmm3		\n\t"\
		"subpd	%%xmm5,%%xmm7		\n\t"\
		"addpd	%%xmm0,%%xmm0		\n\t"\
		"addpd	%%xmm4,%%xmm4		\n\t"\
		"addpd	%%xmm1,%%xmm1		\n\t"\
		"addpd	%%xmm5,%%xmm5		\n\t"\
		"addpd	%%xmm2,%%xmm0		\n\t"\
		"addpd	%%xmm6,%%xmm4		\n\t"\
		"addpd	%%xmm3,%%xmm1		\n\t"\
		"addpd	%%xmm7,%%xmm5		\n\t"\
		"subpd	%%xmm4,%%xmm0		\n\t"\
		"subpd	%%xmm7,%%xmm2		\n\t"\
		"subpd	%%xmm5,%%xmm1		\n\t"\
		"subpd	%%xmm6,%%xmm3		\n\t"\
	"prefetcht1	0x100(%%rcx)\n\t"\
		"leaq	0x10(%%rsi),%%rdi	\n\t"\
		"movaps	%%xmm0,%c[__o2](%%rsi)	\n\t"\
		"movaps	%%xmm2,%c[__o3](%%rsi)	\n\t"\
		"movaps	%%xmm1,%c[__o2](%%rdi)	\n\t"\
		"movaps	%%xmm3,%c[__o1](%%rdi)	\n\t"\
		"addpd	%%xmm4,%%xmm4		\n\t"\
		"addpd	%%xmm7,%%xmm7		\n\t"\
		"addpd	%%xmm5,%%xmm5		\n\t"\
		"addpd	%%xmm6,%%xmm6		\n\t"\
		"addpd	%%xmm0,%%xmm4		\n\t"\
		"addpd	%%xmm2,%%xmm7		\n\t"\
		"addpd	%%xmm1,%%xmm5		\n\t"\
		"addpd	%%xmm3,%%xmm6		\n\t"\
		"movaps	%%xmm4,        (%%rsi)	\n\t"\
		"movaps	%%xmm7,%c[__o1](%%rsi)	\n\t"\
		"movaps	%%xmm5,        (%%rdi)	\n\t"\
		"movaps	%%xmm6,%c[__o3](%%rdi)	\n\t"\
	/* SSE2_RADIX4_DIT_0TWIDDLE_B(r24): */\
		"movslq		0x30(%%r9),%%rax	\n\t"/* off[c-f] */\
		"movslq		0x34(%%r9),%%rbx	\n\t"\
		"movslq		0x38(%%r9),%%rcx	\n\t"\
		"movslq		0x3c(%%r9),%%rdx	\n\t"\
		"leaq	(%%r8,%%rax,8),%%rax	\n\t"/* in0 + off[c-f] */\
		"leaq	(%%r8,%%rbx,8),%%rbx	\n\t"\
		"leaq	(%%r8,%%rcx,8),%%rcx	\n\t"\
		"leaq	(%%r8,%%rdx,8),%%rdx	\n\t"\
		"addq	$%c[__o4],%%rsi		\n\t"/* __out0 + c*ostride */\
	"prefetcht1	0x100(%%rax)\n\t"\
		"movaps	    (%%rax),%%xmm2	\n\t"\
		"movaps	    (%%rcx),%%xmm6	\n\t"\
		"movaps	0x10(%%rax),%%xmm3	\n\t"\
		"movaps	0x10(%%rcx),%%xmm7	\n\t"\
		"movaps	    (%%rbx),%%xmm0	\n\t"\
		"movaps	    (%%rdx),%%xmm4	\n\t"\
		"movaps	0x10(%%rbx),%%xmm1	\n\t"\
		"movaps	0x10(%%rdx),%%xmm5	\n\t"\
		"subpd	%%xmm0,%%xmm2		\n\t"\
		"subpd	%%xmm4,%%xmm6		\n\t"\
		"subpd	%%xmm1,%%xmm3		\n\t"\
		"subpd	%%xmm5,%%xmm7		\n\t"\
		"addpd	%%xmm0,%%xmm0		\n\t"\
		"addpd	%%xmm4,%%xmm4		\n\t"\
		"addpd	%%xmm1,%%xmm1		\n\t"\
		"addpd	%%xmm5,%%xmm5		\n\t"\
		"addpd	%%xmm2,%%xmm0		\n\t"\
		"addpd	%%xmm6,%%xmm4		\n\t"\
		"addpd	%%xmm3,%%xmm1		\n\t"\
		"addpd	%%xmm7,%%xmm5		\n\t"\
		"subpd	%%xmm4,%%xmm0		\n\t"\
		"subpd	%%xmm7,%%xmm2		\n\t"\
		"subpd	%%xmm5,%%xmm1		\n\t"\
		"subpd	%%xmm6,%%xmm3		\n\t"\
	"prefetcht1	0x100(%%rcx)\n\t"\
		"leaq	0x10(%%rsi),%%rdi	\n\t"\
		"movaps	%%xmm0,%c[__o2](%%rsi)	\n\t"\
		"movaps	%%xmm2,%c[__o3](%%rsi)	\n\t"\
		"movaps	%%xmm1,%c[__o2](%%rdi)	\n\t"\
		"movaps	%%xmm3,%c[__o1](%%rdi)	\n\t"\
		"addpd	%%xmm4,%%xmm4		\n\t"\
		"addpd	%%xmm7,%%xmm7		\n\t"\
		"addpd	%%xmm5,%%xmm5		\n\t"\
		"addpd	%%xmm6,%%xmm6		\n\t"\
		"addpd	%%xmm0,%%xmm4		\n\t"\
		"addpd	%%xmm2,%%xmm7		\n\t"\
		"addpd	%%xmm1,%%xmm5		\n\t"\
		"addpd	%%xmm3,%%xmm6		\n\t"\
		"movaps	%%xmm4,        (%%rsi)	\n\t"\
		"movaps	%%xmm7,%c[__o1](%%rsi)	\n\t"\
		"movaps	%%xmm5,        (%%rdi)	\n\t"\
		"movaps	%%xmm6,%c[__o3](%%rdi)	\n\t"\
	/*** Now do 4 DFTs with internal twiddles on the 4*stride - separated data: ***/\
		"movq	%[__out0],%%rax		\n\t"\
		"leaq	%c[__o4](%%rax),%%rbx	\n\t"/* __out0 +   [4*ostride] */\
		"leaq	%c[__o4](%%rbx),%%rcx	\n\t"/* __out0 + 2*[4*ostride] */\
		"leaq	%c[__o4](%%rcx),%%rdx	\n\t"/* __out0 + 3*[4*ostride] */\
		/* Block 0: */\
		"movaps	    (%%rax),%%xmm0	\n\t"\
		"movaps	0x10(%%rax),%%xmm1	\n\t"\
		"movaps	    (%%rbx),%%xmm2	\n\t"\
		"movaps	0x10(%%rbx),%%xmm3	\n\t"\
		"subpd	    (%%rbx),%%xmm0	\n\t"\
		"subpd	0x10(%%rbx),%%xmm1	\n\t"\
		"addpd	    (%%rax),%%xmm2	\n\t"\
		"addpd	0x10(%%rax),%%xmm3	\n\t"\
		"movaps	    (%%rcx),%%xmm4	\n\t"\
		"movaps	0x10(%%rcx),%%xmm5	\n\t"\
		"movaps	    (%%rdx),%%xmm6	\n\t"\
		"movaps	0x10(%%rdx),%%xmm7	\n\t"\
		"subpd	    (%%rdx),%%xmm4	\n\t"\
		"subpd	0x10(%%rdx),%%xmm5	\n\t"\
		"addpd	    (%%rcx),%%xmm6	\n\t"\
		"addpd	0x10(%%rcx),%%xmm7	\n\t"\
		"subpd	%%xmm6,%%xmm2		\n\t"\
		"subpd	%%xmm7,%%xmm3		\n\t"\
		"movaps	%%xmm2,    (%%rcx)	\n\t"\
		"movaps	%%xmm3,0x10(%%rcx)	\n\t"\
		"addpd	%%xmm6,%%xmm6		\n\t"\
		"addpd	%%xmm7,%%xmm7		\n\t"\
		"addpd	%%xmm2,%%xmm6		\n\t"\
		"addpd	%%xmm3,%%xmm7		\n\t"\
		"movaps	%%xmm6,    (%%rax)	\n\t"\
		"movaps	%%xmm7,0x10(%%rax)	\n\t"\
		"subpd	%%xmm5,%%xmm0		\n\t"\
		"subpd	%%xmm4,%%xmm1		\n\t"\
		"movaps	%%xmm0,    (%%rdx)	\n\t"\
		"movaps	%%xmm1,0x10(%%rbx)	\n\t"\
		"addpd	%%xmm5,%%xmm5		\n\t"\
		"addpd	%%xmm4,%%xmm4		\n\t"\
		"addpd	%%xmm0,%%xmm5		\n\t"\
		"addpd	%%xmm1,%%xmm4		\n\t"\
		"movaps	%%xmm5,    (%%rbx)	\n\t"\
		"movaps	%%xmm4,0x10(%%rdx)	\n\t"\
		/* Block 2: */\
		"addq	$%c[__o2],%%rax		\n\t"/* All addresses += 2*ostride */\
		"addq	$%c[__o2],%%rbx		\n\t"\
		"addq	$%c[__o2],%%rcx		\n\t"\
		"addq	$%c[__o2],%%rdx		\n\t"\
		"movq	%[__isrt2],%%rdi	\n\t"/* rdi -> 1/sqrt2 vector constant for this block's twiddles */\
		"movaps	(%%rdi),%%xmm2		\n\t"\
		"movaps	    (%%rcx),%%xmm4	\n\t"\
		"movaps	0x10(%%rcx),%%xmm5	\n\t"\
		"movaps	    (%%rdx),%%xmm0	\n\t"\
		"movaps	0x10(%%rdx),%%xmm1	\n\t"\
		"addpd	0x10(%%rcx),%%xmm4	\n\t"\
		"subpd	    (%%rcx),%%xmm5	\n\t"\
		"subpd	0x10(%%rdx),%%xmm0	\n\t"\
		"addpd	    (%%rdx),%%xmm1	\n\t"\
		"mulpd	%%xmm2,%%xmm4		\n\t"\
		"mulpd	%%xmm2,%%xmm5		\n\t"\
		"mulpd	%%xmm2,%%xmm0		\n\t"\
		"mulpd	%%xmm2,%%xmm1		\n\t"\
		"movaps	%%xmm4,%%xmm6		\n\t"\
		"movaps	%%xmm5,%%xmm7		\n\t"\
		"subpd	%%xmm0,%%xmm4		\n\t"\
		"subpd	%%xmm1,%%xmm5		\n\t"\
		"addpd	%%xmm0,%%xmm6		\n\t"\
		"addpd	%%xmm1,%%xmm7		\n\t"\
		"movaps	    (%%rax),%%xmm0	\n\t"\
		"movaps	0x10(%%rax),%%xmm1	\n\t"\
		"movaps	    (%%rbx),%%xmm2	\n\t"\
		"movaps	0x10(%%rbx),%%xmm3	\n\t"\
		"subpd	0x10(%%rbx),%%xmm0	\n\t"\
		"subpd	    (%%rbx),%%xmm1	\n\t"\
		"addpd	    (%%rax),%%xmm3	\n\t"\
		"addpd	0x10(%%rax),%%xmm2	\n\t"\
		"subpd	%%xmm4,%%xmm3		\n\t"\
		"subpd	%%xmm5,%%xmm1		\n\t"\
		"movaps	%%xmm3,    (%%rcx)	\n\t"\
		"movaps	%%xmm1,0x10(%%rcx)	\n\t"\
		"addpd	%%xmm4,%%xmm4		\n\t"\
		"addpd	%%xmm5,%%xmm5		\n\t"\
		"addpd	%%xmm3,%%xmm4		\n\t"\
		"addpd	%%xmm1,%%xmm5		\n\t"\
		"movaps	%%xmm4,    (%%rax)	\n\t"\
		"movaps	%%xmm5,0x10(%%rax)	\n\t"\
		"subpd	%%xmm7,%%xmm0		\n\t"\
		"subpd	%%xmm6,%%xmm2		\n\t"\
		"movaps	%%xmm0,    (%%rdx)	\n\t"\
		"movaps	%%xmm2,0x10(%%rbx)	\n\t"\
		"addpd	%%xmm7,%%xmm7		\n\t"\
		"addpd	%%xmm6,%%xmm6		\n\t"\
		"addpd	%%xmm0,%%xmm7		\n\t"\
		"addpd	%%xmm2,%%xmm6		\n\t"\
		"movaps	%%xmm7,    (%%rbx)	\n\t"\
		"movaps	%%xmm6,0x10(%%rdx)	\n\t"\
		/* Block 1: */\
		"subq	$%c[__o1],%%rax		\n\t"/* All subresses -= 1*ostride */\
		"subq	$%c[__o1],%%rbx		\n\t"\
		"subq	$%c[__o1],%%rcx		\n\t"\
		"subq	$%c[__o1],%%rdx		\n\t"\
		"leaq	0x10(%%rdi),%%rsi	\n\t"/* cc0 */\
		"movaps	    (%%rdx),%%xmm0	\n\t"\
		"movaps	0x10(%%rdx),%%xmm1	\n\t"\
		"movaps	    (%%rdx),%%xmm2	\n\t"\
		"movaps	0x10(%%rdx),%%xmm3	\n\t"\
		"mulpd	0x10(%%rsi),%%xmm0	\n\t"/* complex-multiply by the (cc0,ss0) twiddle pair */\
		"mulpd	0x10(%%rsi),%%xmm1	\n\t"\
		"mulpd	    (%%rsi),%%xmm2	\n\t"\
		"mulpd	    (%%rsi),%%xmm3	\n\t"\
		"subpd	%%xmm2,%%xmm1		\n\t"\
		"addpd	%%xmm3,%%xmm0		\n\t"\
		"movaps	    (%%rcx),%%xmm4	\n\t"\
		"movaps	0x10(%%rcx),%%xmm5	\n\t"\
		"movaps	    (%%rcx),%%xmm6	\n\t"\
		"movaps	0x10(%%rcx),%%xmm7	\n\t"\
		"mulpd	    (%%rsi),%%xmm4	\n\t"\
		"mulpd	    (%%rsi),%%xmm5	\n\t"\
		"mulpd	0x10(%%rsi),%%xmm6	\n\t"\
		"mulpd	0x10(%%rsi),%%xmm7	\n\t"\
		"subpd	%%xmm6,%%xmm5		\n\t"\
		"addpd	%%xmm7,%%xmm4		\n\t"\
		"movaps	%%xmm5,%%xmm7		\n\t"\
		"movaps	%%xmm4,%%xmm6		\n\t"\
		"addpd	%%xmm0,%%xmm4		\n\t"\
		"addpd	%%xmm1,%%xmm5		\n\t"\
		"subpd	%%xmm0,%%xmm6		\n\t"\
		"subpd	%%xmm1,%%xmm7		\n\t"\
		"movaps	    (%%rbx),%%xmm2	\n\t"\
		"movaps	0x10(%%rbx),%%xmm3	\n\t"\
		"movaps	    (%%rax),%%xmm0	\n\t"\
		"movaps	0x10(%%rax),%%xmm1	\n\t"\
		"addpd	0x10(%%rbx),%%xmm2	\n\t"\
		"subpd	    (%%rbx),%%xmm3	\n\t"\
		"mulpd	    (%%rdi),%%xmm2	\n\t"/* scale by 1/sqrt2 */\
		"mulpd	    (%%rdi),%%xmm3	\n\t"\
		"subpd	%%xmm2,%%xmm0		\n\t"\
		"subpd	%%xmm3,%%xmm1		\n\t"\
		"addpd	%%xmm2,%%xmm2		\n\t"\
		"addpd	%%xmm3,%%xmm3		\n\t"\
		"addpd	%%xmm0,%%xmm2		\n\t"\
		"addpd	%%xmm1,%%xmm3		\n\t"\
		"subpd	%%xmm4,%%xmm2		\n\t"\
		"subpd	%%xmm5,%%xmm3		\n\t"\
		"movaps	%%xmm2,    (%%rcx)	\n\t"\
		"movaps	%%xmm3,0x10(%%rcx)	\n\t"\
		"addpd	%%xmm4,%%xmm4		\n\t"\
		"addpd	%%xmm5,%%xmm5		\n\t"\
		"addpd	%%xmm2,%%xmm4		\n\t"\
		"addpd	%%xmm3,%%xmm5		\n\t"\
		"movaps	%%xmm4,    (%%rax)	\n\t"\
		"movaps	%%xmm5,0x10(%%rax)	\n\t"\
		"subpd	%%xmm7,%%xmm0		\n\t"\
		"subpd	%%xmm6,%%xmm1		\n\t"\
		"movaps	%%xmm0,    (%%rdx)	\n\t"\
		"movaps	%%xmm1,0x10(%%rbx)	\n\t"\
		"addpd	%%xmm7,%%xmm7		\n\t"\
		"addpd	%%xmm6,%%xmm6		\n\t"\
		"addpd	%%xmm0,%%xmm7		\n\t"\
		"addpd	%%xmm1,%%xmm6		\n\t"\
		"movaps	%%xmm7,    (%%rbx)	\n\t"\
		"movaps	%%xmm6,0x10(%%rdx)	\n\t"\
		/* Block 3: */\
		"addq	$%c[__o2],%%rax		\n\t"/* All addresses += 2*ostride */\
		"addq	$%c[__o2],%%rbx		\n\t"\
		"addq	$%c[__o2],%%rcx		\n\t"\
		"addq	$%c[__o2],%%rdx		\n\t"\
		"movaps	    (%%rdx),%%xmm0	\n\t"\
		"movaps	0x10(%%rdx),%%xmm1	\n\t"\
		"movaps	    (%%rdx),%%xmm2	\n\t"\
		"movaps	0x10(%%rdx),%%xmm3	\n\t"\
		"mulpd	    (%%rsi),%%xmm0	\n\t"/* as Block 1, but with cc0/ss0 roles swapped */\
		"mulpd	    (%%rsi),%%xmm1	\n\t"\
		"mulpd	0x10(%%rsi),%%xmm2	\n\t"\
		"mulpd	0x10(%%rsi),%%xmm3	\n\t"\
		"subpd	%%xmm2,%%xmm1		\n\t"\
		"addpd	%%xmm3,%%xmm0		\n\t"\
		"movaps	    (%%rcx),%%xmm4	\n\t"\
		"movaps	0x10(%%rcx),%%xmm5	\n\t"\
		"movaps	    (%%rcx),%%xmm6	\n\t"\
		"movaps	0x10(%%rcx),%%xmm7	\n\t"\
		"mulpd	0x10(%%rsi),%%xmm4	\n\t"\
		"mulpd	0x10(%%rsi),%%xmm5	\n\t"\
		"mulpd	    (%%rsi),%%xmm6	\n\t"\
		"mulpd	    (%%rsi),%%xmm7	\n\t"\
		"subpd	%%xmm6,%%xmm5		\n\t"\
		"addpd	%%xmm7,%%xmm4		\n\t"\
		"movaps	%%xmm5,%%xmm7		\n\t"\
		"movaps	%%xmm4,%%xmm6		\n\t"\
		"addpd	%%xmm0,%%xmm4		\n\t"\
		"addpd	%%xmm1,%%xmm5		\n\t"\
		"subpd	%%xmm0,%%xmm6		\n\t"\
		"subpd	%%xmm1,%%xmm7		\n\t"\
		"movaps	    (%%rbx),%%xmm2	\n\t"\
		"movaps	0x10(%%rbx),%%xmm3	\n\t"\
		"movaps	    (%%rax),%%xmm0	\n\t"\
		"movaps	0x10(%%rax),%%xmm1	\n\t"\
		"subpd	0x10(%%rbx),%%xmm2	\n\t"\
		"addpd	    (%%rbx),%%xmm3	\n\t"\
		"mulpd	    (%%rdi),%%xmm2	\n\t"\
		"mulpd	    (%%rdi),%%xmm3	\n\t"\
		"subpd	%%xmm2,%%xmm0		\n\t"\
		"subpd	%%xmm3,%%xmm1		\n\t"\
		"addpd	%%xmm2,%%xmm2		\n\t"\
		"addpd	%%xmm3,%%xmm3		\n\t"\
		"addpd	%%xmm0,%%xmm2		\n\t"\
		"addpd	%%xmm1,%%xmm3		\n\t"\
		"subpd	%%xmm6,%%xmm0		\n\t"\
		"subpd	%%xmm7,%%xmm1		\n\t"\
		"movaps	%%xmm0,    (%%rcx)	\n\t"\
		"movaps	%%xmm1,0x10(%%rcx)	\n\t"\
		"addpd	%%xmm6,%%xmm6		\n\t"\
		"addpd	%%xmm7,%%xmm7		\n\t"\
		"addpd	%%xmm0,%%xmm6		\n\t"\
		"addpd	%%xmm1,%%xmm7		\n\t"\
		"movaps	%%xmm6,    (%%rax)	\n\t"\
		"movaps	%%xmm7,0x10(%%rax)	\n\t"\
		"subpd	%%xmm5,%%xmm2		\n\t"\
		"subpd	%%xmm4,%%xmm3		\n\t"\
		"movaps	%%xmm2,    (%%rdx)	\n\t"\
		"movaps	%%xmm3,0x10(%%rbx)	\n\t"\
		"addpd	%%xmm5,%%xmm5		\n\t"\
		"addpd	%%xmm4,%%xmm4		\n\t"\
		"addpd	%%xmm2,%%xmm5		\n\t"\
		"addpd	%%xmm3,%%xmm4		\n\t"\
		"movaps	%%xmm5,    (%%rbx)	\n\t"\
		"movaps	%%xmm4,0x10(%%rdx)	\n\t"\
		:					/* outputs: none */\
		:[in0] "m" (Xin0)	/* Input-address-16-tet base pointer */\
		,[off] "m" (Xoff)	/* and pointer to uint32 array of 16 double* index offsets */\
		,[__isrt2] "m" (Xisrt2)\
		,[__two] "m" (Xtwo)\
		,[__out0] "m" (Xout0)\
		,[__o1] "e" (Xo1)\
		,[__o2] "e" (Xo2)\
		,[__o3] "e" (Xo3)\
		,[__o4] "e" (Xo4)\
		: "cc","memory","rax","rbx","rcx","rdx","rdi","rsi","r8","r9","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7"	/* Clobbered registers */\
	);\
	}

	// Based on the SSE2_RADIX16_DIF_NOTWIDDLE macro in radix16_ditN_cy_dif1_gcc64.h, but with completely
	// specifiable 16-output addressing required for usage as the power-of-2 component of a twiddleless
	// radix = [odd*2^n] DFT routine.
	/* Dec 2020: Needed to cut #args for Apple M1/Clang builds on Arm64 - do similar on x86 to avoid
	multiple versions of the macro having different arglists. Replace 16 O-addresses with O-base-address
	out0 and pointer to array of 16 int offset-indices: */
	#define SSE2_RADIX16_DIF_0TWIDDLE(Xin0,Xi1,Xi2,Xi3,Xi4, Xisrt2,Xtwo, Xout0,Xoff)\
	{\
	__asm__ volatile (\
	/* SSE2_RADIX4_DIF_IN_PLACE(r1 , r17, r9 , r25): */\
		"movq	%[__in0],%%rax		\n\t"\
		"leaq	%c[__i4](%%rax),%%rcx	\n\t"/* __in0 +   [4*istride]; note BR of [a,b,c,d]-ptrs, i.e. b/c swap */\
		"leaq	%c[__i4](%%rcx),%%rbx	\n\t"/* __in0 + 2*[4*istride] */\
		"leaq	%c[__i4](%%rbx),%%rdx	\n\t"/* __in0 + 3*[4*istride] */\
		"movaps	    (%%rax),%%xmm0	\n\t"\
		"movaps	0x10(%%rax),%%xmm1	\n\t"\
		"movaps	    (%%rax),%%xmm2	\n\t"\
		"movaps	0x10(%%rax),%%xmm3	\n\t"\
		"addpd	    (%%rbx),%%xmm0	\n\t"\
		"addpd	0x10(%%rbx),%%xmm1	\n\t"\
		"subpd	    (%%rbx),%%xmm2	\n\t"\
		"subpd	0x10(%%rbx),%%xmm3	\n\t"\
		"movaps	    (%%rcx),%%xmm4	\n\t"\
		"movaps	0x10(%%rcx),%%xmm5	\n\t"\
		"movaps	    (%%rcx),%%xmm6	\n\t"\
		"movaps	0x10(%%rcx),%%xmm7	\n\t"\
		"addpd	    (%%rdx),%%xmm4	\n\t"\
		"addpd	0x10(%%rdx),%%xmm5	\n\t"\
		"subpd	    (%%rdx),%%xmm6	\n\t"\
		"subpd	0x10(%%rdx),%%xmm7	\n\t"\
		"subpd	%%xmm4,%%xmm0		\n\t"\
		"subpd	%%xmm5,%%xmm1		\n\t"\
		"movaps	%%xmm0,    (%%rbx)	\n\t"\
		"movaps	%%xmm1,0x10(%%rbx)	\n\t"\
		"addpd	%%xmm4,%%xmm4		\n\t"\
		"addpd	%%xmm5,%%xmm5		\n\t"\
		"addpd	%%xmm0,%%xmm4		\n\t"\
		"addpd	%%xmm1,%%xmm5		\n\t"\
		"movaps	%%xmm4,    (%%rax)	\n\t"\
		"movaps	%%xmm5,0x10(%%rax)	\n\t"\
		"subpd	%%xmm7,%%xmm2		\n\t"\
		"subpd	%%xmm6,%%xmm3		\n\t"\
		"movaps	%%xmm2,    (%%rcx)	\n\t"\
		"movaps	%%xmm3,0x10(%%rdx)	\n\t"\
		"addpd	%%xmm7,%%xmm7		\n\t"\
		"addpd	%%xmm6,%%xmm6		\n\t"\
		"addpd	%%xmm2,%%xmm7		\n\t"\
		"addpd	%%xmm3,%%xmm6		\n\t"\
		"movaps	%%xmm7,    (%%rdx)	\n\t"\
		"movaps	%%xmm6,0x10(%%rcx)	\n\t"\
	/* SSE2_RADIX4_DIF_IN_PLACE(r5 , r21, r13, r29): */\
		"addq	$%c[__i2],%%rax	\n\t"/* All addresses += 2*ostride */\
		"addq	$%c[__i2],%%rbx	\n\t"\
		"addq	$%c[__i2],%%rcx	\n\t"\
		"addq	$%c[__i2],%%rdx	\n\t"\
		"movaps	    (%%rax),%%xmm0	\n\t"\
		"movaps	0x10(%%rax),%%xmm1	\n\t"\
		"movaps	    (%%rax),%%xmm2	\n\t"\
		"movaps	0x10(%%rax),%%xmm3	\n\t"\
		"addpd	    (%%rbx),%%xmm0	\n\t"\
		"addpd	0x10(%%rbx),%%xmm1	\n\t"\
		"subpd	    (%%rbx),%%xmm2	\n\t"\
		"subpd	0x10(%%rbx),%%xmm3	\n\t"\
		"movaps	    (%%rcx),%%xmm4	\n\t"\
		"movaps	0x10(%%rcx),%%xmm5	\n\t"\
		"movaps	    (%%rcx),%%xmm6	\n\t"\
		"movaps	0x10(%%rcx),%%xmm7	\n\t"\
		"addpd	    (%%rdx),%%xmm4	\n\t"\
		"addpd	0x10(%%rdx),%%xmm5	\n\t"\
		"subpd	    (%%rdx),%%xmm6	\n\t"\
		"subpd	0x10(%%rdx),%%xmm7	\n\t"\
		"subpd	%%xmm4,%%xmm0		\n\t"\
		"subpd	%%xmm5,%%xmm1		\n\t"\
		"movaps	%%xmm0,    (%%rbx)	\n\t"\
		"movaps	%%xmm1,0x10(%%rbx)	\n\t"\
		"addpd	%%xmm4,%%xmm4		\n\t"\
		"addpd	%%xmm5,%%xmm5		\n\t"\
		"addpd	%%xmm0,%%xmm4		\n\t"\
		"addpd	%%xmm1,%%xmm5		\n\t"\
		"movaps	%%xmm4,    (%%rax)	\n\t"\
		"movaps	%%xmm5,0x10(%%rax)	\n\t"\
		"subpd	%%xmm7,%%xmm2		\n\t"\
		"subpd	%%xmm6,%%xmm3		\n\t"\
		"movaps	%%xmm2,    (%%rcx)	\n\t"\
		"movaps	%%xmm3,0x10(%%rdx)	\n\t"\
		"addpd	%%xmm7,%%xmm7		\n\t"\
		"addpd	%%xmm6,%%xmm6		\n\t"\
		"addpd	%%xmm2,%%xmm7		\n\t"\
		"addpd	%%xmm3,%%xmm6		\n\t"\
		"movaps	%%xmm7,    (%%rdx)	\n\t"\
		"movaps	%%xmm6,0x10(%%rcx)	\n\t"\
	/* SSE2_RADIX4_DIF_IN_PLACE(r3 , r19, r11, r27): */\
		"subq	$%c[__i1],%%rax	\n\t"/* All addresses -= 1*ostride */\
		"subq	$%c[__i1],%%rbx	\n\t"\
		"subq	$%c[__i1],%%rcx	\n\t"\
		"subq	$%c[__i1],%%rdx	\n\t"\
		"movaps	    (%%rax),%%xmm0	\n\t"\
		"movaps	0x10(%%rax),%%xmm1	\n\t"\
		"movaps	    (%%rax),%%xmm2	\n\t"\
		"movaps	0x10(%%rax),%%xmm3	\n\t"\
		"addpd	    (%%rbx),%%xmm0	\n\t"\
		"addpd	0x10(%%rbx),%%xmm1	\n\t"\
		"subpd	    (%%rbx),%%xmm2	\n\t"\
		"subpd	0x10(%%rbx),%%xmm3	\n\t"\
		"movaps	    (%%rcx),%%xmm4	\n\t"\
		"movaps	0x10(%%rcx),%%xmm5	\n\t"\
		"movaps	    (%%rcx),%%xmm6	\n\t"\
		"movaps	0x10(%%rcx),%%xmm7	\n\t"\
		"addpd	    (%%rdx),%%xmm4	\n\t"\
		"addpd	0x10(%%rdx),%%xmm5	\n\t"\
		"subpd	    (%%rdx),%%xmm6	\n\t"\
		"subpd	0x10(%%rdx),%%xmm7	\n\t"\
		"subpd	%%xmm4,%%xmm0		\n\t"\
		"subpd	%%xmm5,%%xmm1		\n\t"\
		"movaps	%%xmm0,    (%%rbx)	\n\t"\
		"movaps	%%xmm1,0x10(%%rbx)	\n\t"\
		"addpd	%%xmm4,%%xmm4		\n\t"\
		"addpd	%%xmm5,%%xmm5		\n\t"\
		"addpd	%%xmm0,%%xmm4		\n\t"\
		"addpd	%%xmm1,%%xmm5		\n\t"\
		"movaps	%%xmm4,    (%%rax)	\n\t"\
		"movaps	%%xmm5,0x10(%%rax)	\n\t"\
		"subpd	%%xmm7,%%xmm2		\n\t"\
		"subpd	%%xmm6,%%xmm3		\n\t"\
		"movaps	%%xmm2,    (%%rcx)	\n\t"\
		"movaps	%%xmm3,0x10(%%rdx)	\n\t"\
		"addpd	%%xmm7,%%xmm7		\n\t"\
		"addpd	%%xmm6,%%xmm6		\n\t"\
		"addpd	%%xmm2,%%xmm7		\n\t"\
		"addpd	%%xmm3,%%xmm6		\n\t"\
		"movaps	%%xmm7,    (%%rdx)	\n\t"\
		"movaps	%%xmm6,0x10(%%rcx)	\n\t"\
	/* SSE2_RADIX4_DIF_IN_PLACE(r7 , r23, r15, r31): */\
		"addq	$%c[__i2],%%rax	\n\t"/* All addresses += 2*ostride */\
		"addq	$%c[__i2],%%rbx	\n\t"\
		"addq	$%c[__i2],%%rcx	\n\t"\
		"addq	$%c[__i2],%%rdx	\n\t"\
		"movaps	    (%%rax),%%xmm0	\n\t"\
		"movaps	0x10(%%rax),%%xmm1	\n\t"\
		"movaps	    (%%rax),%%xmm2	\n\t"\
		"movaps	0x10(%%rax),%%xmm3	\n\t"\
		"addpd	    (%%rbx),%%xmm0	\n\t"\
		"addpd	0x10(%%rbx),%%xmm1	\n\t"\
		"subpd	    (%%rbx),%%xmm2	\n\t"\
		"subpd	0x10(%%rbx),%%xmm3	\n\t"\
		"movaps	    (%%rcx),%%xmm4	\n\t"\
		"movaps	0x10(%%rcx),%%xmm5	\n\t"\
		"movaps	    (%%rcx),%%xmm6	\n\t"\
		"movaps	0x10(%%rcx),%%xmm7	\n\t"\
		"addpd	    (%%rdx),%%xmm4	\n\t"\
		"addpd	0x10(%%rdx),%%xmm5	\n\t"\
		"subpd	    (%%rdx),%%xmm6	\n\t"\
		"subpd	0x10(%%rdx),%%xmm7	\n\t"\
		"subpd	%%xmm4,%%xmm0		\n\t"\
		"subpd	%%xmm5,%%xmm1		\n\t"\
		"movaps	%%xmm0,    (%%rbx)	\n\t"\
		"movaps	%%xmm1,0x10(%%rbx)	\n\t"\
		"addpd	%%xmm4,%%xmm4		\n\t"\
		"addpd	%%xmm5,%%xmm5		\n\t"\
		"addpd	%%xmm0,%%xmm4		\n\t"\
		"addpd	%%xmm1,%%xmm5		\n\t"\
		"movaps	%%xmm4,    (%%rax)	\n\t"\
		"movaps	%%xmm5,0x10(%%rax)	\n\t"\
		"subpd	%%xmm7,%%xmm2		\n\t"\
		"subpd	%%xmm6,%%xmm3		\n\t"\
		"movaps	%%xmm2,    (%%rcx)	\n\t"\
		"movaps	%%xmm3,0x10(%%rdx)	\n\t"\
		"addpd	%%xmm7,%%xmm7		\n\t"\
		"addpd	%%xmm6,%%xmm6		\n\t"\
		"addpd	%%xmm2,%%xmm7		\n\t"\
		"addpd	%%xmm3,%%xmm6		\n\t"\
		"movaps	%%xmm7,    (%%rdx)	\n\t"\
		"movaps	%%xmm6,0x10(%%rcx)	\n\t"\
	/*****************************************************************************************
	**** Now do 4 DFTs with internal twiddles on the 1*stride - separated data. Do blocks ****
	**** in order 0,2,1,3 to allow increment-only of rsi-datum from 1 block to the next:  ****
	*****************************************************************************************/\
	"movq	%[__in0],%%rsi	\n\t"\
	"movq	%[out0],%%r8	\n\t	movq	%[off],%%r9	\n\t"/* Load output base-address into r8 and offset-array pointer into r9 */\
	/* Block 0: r0-3 */\
		"movslq		    (%%r9),%%rax	\n\t"/*        off0 */\
		"movslq		0x04(%%r9),%%rbx	\n\t"/*        off1 */\
		"movslq		0x08(%%r9),%%rcx	\n\t"/*        off2 */\
		"movslq		0x0c(%%r9),%%rdx	\n\t"/*        off3 */\
		"leaq	(%%r8,%%rax,8),%%rax	\n\t"/* out0 + off0 */\
		"leaq	(%%r8,%%rbx,8),%%rbx	\n\t"/* out0 + off1 */\
		"leaq	(%%r8,%%rcx,8),%%rcx	\n\t"/* out0 + off2 */\
		"leaq	(%%r8,%%rdx,8),%%rdx	\n\t"/* out0 + off3 */\
		"leaq	0x10(%%rsi),%%rdi	\n\t"/* Need separate address Im parts of outputs due to literal-offsets below */\
		"movaps	        (%%rsi),%%xmm0	\n\t"\
		"movaps	        (%%rdi),%%xmm1	\n\t"\
		"movaps	%c[__i2](%%rsi),%%xmm2	\n\t"\
		"movaps	%c[__i2](%%rdi),%%xmm3	\n\t"\
		"subpd	%c[__i2](%%rsi),%%xmm0	\n\t"\
		"subpd	%c[__i2](%%rdi),%%xmm1	\n\t"\
		"addpd	        (%%rsi),%%xmm2	\n\t"\
		"addpd	        (%%rdi),%%xmm3	\n\t"\
		"movaps	%c[__i1](%%rsi),%%xmm4	\n\t"\
		"movaps	%c[__i1](%%rdi),%%xmm5	\n\t"\
		"movaps	%c[__i3](%%rsi),%%xmm6	\n\t"\
		"movaps	%c[__i3](%%rdi),%%xmm7	\n\t"\
		"subpd	%c[__i3](%%rsi),%%xmm4	\n\t"\
		"subpd	%c[__i3](%%rdi),%%xmm5	\n\t"\
		"addpd	%c[__i1](%%rsi),%%xmm6	\n\t"\
		"addpd	%c[__i1](%%rdi),%%xmm7	\n\t"\
		"subpd	%%xmm6,%%xmm2		\n\t"\
		"subpd	%%xmm7,%%xmm3		\n\t"\
		"addpd	%%xmm6,%%xmm6		\n\t"\
		"addpd	%%xmm7,%%xmm7		\n\t"\
		"movaps	%%xmm2,    (%%rbx)	\n\t"\
		"movaps	%%xmm3,0x10(%%rbx)	\n\t"\
		"addpd	%%xmm2,		%%xmm6	\n\t"\
		"addpd	%%xmm3,		%%xmm7	\n\t"\
		"movaps	%%xmm6,    (%%rax)	\n\t"\
		"movaps	%%xmm7,0x10(%%rax)	\n\t"\
		"subpd	%%xmm5,		%%xmm0	\n\t"\
		"subpd	%%xmm4,		%%xmm1	\n\t"\
		"addpd	%%xmm5,		%%xmm5	\n\t"\
		"addpd	%%xmm4,		%%xmm4	\n\t"\
		"movaps	%%xmm0,    (%%rcx)	\n\t"\
		"movaps	%%xmm1,0x10(%%rdx)	\n\t"\
		"addpd	%%xmm0,		%%xmm5	\n\t"\
		"addpd	%%xmm1,		%%xmm4	\n\t"\
		"movaps	%%xmm5,    (%%rdx)	\n\t"\
		"movaps	%%xmm4,0x10(%%rcx)	\n\t"\
	/* Block 2: */\
		"movslq		0x20(%%r9),%%rax	\n\t"/* off8-b */\
		"movslq		0x24(%%r9),%%rbx	\n\t"\
		"movslq		0x28(%%r9),%%rcx	\n\t"\
		"movslq		0x2c(%%r9),%%rdx	\n\t"\
		"leaq	(%%r8,%%rax,8),%%rax	\n\t"\
		"leaq	(%%r8,%%rbx,8),%%rbx	\n\t"\
		"leaq	(%%r8,%%rcx,8),%%rcx	\n\t"\
		"leaq	(%%r8,%%rdx,8),%%rdx	\n\t"\
		"addq	$%c[__i4],%%rsi	\n\t"/* __in0 + 4*ostride */\
		"leaq	0x10(%%rsi),%%rdi	\n\t"\
		"movaps	%c[__i1](%%rsi),%%xmm4	\n\t"\
		"movaps	%c[__i3](%%rsi),%%xmm6	\n\t"\
		"movaps	%c[__i1](%%rdi),%%xmm5	\n\t"\
		"movaps	%c[__i3](%%rdi),%%xmm7	\n\t"\
		"movaps	%%xmm4,%%xmm0		\n\t"\
		"movaps	%%xmm6,%%xmm2		\n\t"\
		"movaps	%%xmm5,%%xmm1		\n\t"\
		"movaps	%%xmm7,%%xmm3		\n\t"\
		"movq	%[__isrt2],%%rdi	\n\t"\
		"addq	$0x10,%%rdi	\n\t"/* cc0 */\
		"mulpd	    (%%rdi),%%xmm4	\n\t"\
		"mulpd	0x10(%%rdi),%%xmm6	\n\t"\
		"mulpd	0x10(%%rdi),%%xmm1	\n\t"\
		"mulpd	    (%%rdi),%%xmm3	\n\t"\
		"mulpd	    (%%rdi),%%xmm5	\n\t"\
		"mulpd	0x10(%%rdi),%%xmm7	\n\t"\
		"mulpd	0x10(%%rdi),%%xmm0	\n\t"\
		"mulpd	    (%%rdi),%%xmm2	\n\t"\
		"subpd	%%xmm1,%%xmm4		\n\t"\
		"subpd	%%xmm3,%%xmm6		\n\t"\
		"addpd	%%xmm0,%%xmm5		\n\t"\
		"addpd	%%xmm2,%%xmm7		\n\t"\
		"subpd	%%xmm6,%%xmm4		\n\t"\
		"subpd	%%xmm7,%%xmm5		\n\t"\
		"addpd	%%xmm6,%%xmm6		\n\t"\
		"addpd	%%xmm7,%%xmm7		\n\t"\
		"addpd	%%xmm4,%%xmm6		\n\t"\
		"addpd	%%xmm5,%%xmm7		\n\t"\
		"leaq	0x10(%%rsi),%%rdi	\n\t"\
		"movaps	%c[__i2](%%rsi),%%xmm2	\n\t"\
		"movaps	%c[__i2](%%rdi),%%xmm3	\n\t"\
		"subpd	%c[__i2](%%rdi),%%xmm2	\n\t"\
		"addpd	%c[__i2](%%rsi),%%xmm3	\n\t"\
		"movq	%[__isrt2],%%rdi	\n\t"\
		"mulpd	(%%rdi),%%xmm2	\n\t"/* mul by isrt2 */\
		"mulpd	(%%rdi),%%xmm3	\n\t"\
		"leaq	0x10(%%rsi),%%rdi	\n\t"\
		"movaps	        (%%rsi),%%xmm0	\n\t"\
		"movaps	        (%%rdi),%%xmm1	\n\t"\
		"subpd	%%xmm2,%%xmm0		\n\t"\
		"subpd	%%xmm3,%%xmm1		\n\t"\
		"addpd	        (%%rsi),%%xmm2	\n\t"\
		"addpd	        (%%rdi),%%xmm3	\n\t"\
		"subpd	%%xmm6,%%xmm2		\n\t"\
		"subpd	%%xmm7,%%xmm3		\n\t"\
		"addpd	%%xmm6,%%xmm6		\n\t"\
		"addpd	%%xmm7,%%xmm7		\n\t"\
		"movaps	%%xmm2,    (%%rbx)	\n\t"\
		"movaps	%%xmm3,0x10(%%rbx)	\n\t"\
		"addpd	%%xmm2,		%%xmm6	\n\t"\
		"addpd	%%xmm3,		%%xmm7	\n\t"\
		"movaps	%%xmm6,    (%%rax)	\n\t"\
		"movaps	%%xmm7,0x10(%%rax)	\n\t"\
		"subpd	%%xmm5,		%%xmm0	\n\t"\
		"subpd	%%xmm4,		%%xmm1	\n\t"\
		"addpd	%%xmm5,		%%xmm5	\n\t"\
		"addpd	%%xmm4,		%%xmm4	\n\t"\
		"movaps	%%xmm0,    (%%rcx)	\n\t"\
		"movaps	%%xmm1,0x10(%%rdx)	\n\t"\
		"addpd	%%xmm0,		%%xmm5	\n\t"\
		"addpd	%%xmm1,		%%xmm4	\n\t"\
		"movaps	%%xmm5,    (%%rdx)	\n\t"\
		"movaps	%%xmm4,0x10(%%rcx)	\n\t"\
	/* Block 1: r8-b */\
		"movslq		0x10(%%r9),%%rax	\n\t"/* off4-7 */\
		"movslq		0x14(%%r9),%%rbx	\n\t"\
		"movslq		0x18(%%r9),%%rcx	\n\t"\
		"movslq		0x1c(%%r9),%%rdx	\n\t"\
		"leaq	(%%r8,%%rax,8),%%rax	\n\t"\
		"leaq	(%%r8,%%rbx,8),%%rbx	\n\t"\
		"leaq	(%%r8,%%rcx,8),%%rcx	\n\t"\
		"leaq	(%%r8,%%rdx,8),%%rdx	\n\t"\
		"addq	$%c[__i4],%%rsi	\n\t"/* __in0 + 8*ostride */\
		"leaq	0x10(%%rsi),%%rdi	\n\t"\
		"movaps	        (%%rsi),%%xmm0	\n\t"\
		"movaps	        (%%rdi),%%xmm1	\n\t"\
		"movaps	%c[__i2](%%rsi),%%xmm2	\n\t"\
		"movaps	%c[__i2](%%rdi),%%xmm3	\n\t"\
		"subpd	%c[__i2](%%rdi),%%xmm0	\n\t"\
		"subpd	%c[__i2](%%rsi),%%xmm1	\n\t"\
		"addpd	        (%%rdi),%%xmm2	\n\t"\
		"addpd	        (%%rsi),%%xmm3	\n\t"\
		"movaps	%c[__i1](%%rsi),%%xmm4	\n\t"\
		"movaps	%c[__i1](%%rdi),%%xmm5	\n\t"\
		"movaps	%c[__i3](%%rsi),%%xmm6	\n\t"\
		"movaps	%c[__i3](%%rdi),%%xmm7	\n\t"\
		"subpd	%c[__i1](%%rdi),%%xmm4	\n\t"\
		"addpd	%c[__i1](%%rsi),%%xmm5	\n\t"\
		"addpd	%c[__i3](%%rdi),%%xmm6	\n\t"\
		"subpd	%c[__i3](%%rsi),%%xmm7	\n\t"\
		"movq	%[__isrt2],%%rdi	\n\t"\
		"mulpd	(%%rdi),%%xmm4		\n\t"\
		"mulpd	(%%rdi),%%xmm5		\n\t"\
		"mulpd	(%%rdi),%%xmm6		\n\t"\
		"mulpd	(%%rdi),%%xmm7		\n\t"\
		"subpd	%%xmm6,%%xmm4		\n\t"\
		"subpd	%%xmm7,%%xmm5		\n\t"\
		"addpd	%%xmm6,%%xmm6		\n\t"\
		"addpd	%%xmm7,%%xmm7		\n\t"\
		"addpd	%%xmm4,%%xmm6		\n\t"\
		"addpd	%%xmm5,%%xmm7		\n\t"\
		"subpd	%%xmm4,%%xmm0		\n\t"\
		"subpd	%%xmm5,%%xmm2		\n\t"\
		"addpd	%%xmm4,%%xmm4		\n\t"\
		"addpd	%%xmm5,%%xmm5		\n\t"\
		"movaps	%%xmm0,    (%%rbx)	\n\t"\
		"movaps	%%xmm2,0x10(%%rbx)	\n\t"\
		"addpd	%%xmm0,		%%xmm4	\n\t"\
		"addpd	%%xmm2,		%%xmm5	\n\t"\
		"movaps	%%xmm4,    (%%rax)	\n\t"\
		"movaps	%%xmm5,0x10(%%rax)	\n\t"\
		"subpd	%%xmm7,		%%xmm3	\n\t"\
		"subpd	%%xmm6,		%%xmm1	\n\t"\
		"addpd	%%xmm7,		%%xmm7	\n\t"\
		"addpd	%%xmm6,		%%xmm6	\n\t"\
		"movaps	%%xmm3,    (%%rcx)	\n\t"\
		"movaps	%%xmm1,0x10(%%rdx)	\n\t"\
		"addpd	%%xmm3,		%%xmm7	\n\t"\
		"addpd	%%xmm1,		%%xmm6	\n\t"\
		"movaps	%%xmm7,    (%%rdx)	\n\t"\
		"movaps	%%xmm6,0x10(%%rcx)	\n\t"\
	/* Block 3: */\
		"movslq		0x30(%%r9),%%rax	\n\t"/* offc-f */\
		"movslq		0x34(%%r9),%%rbx	\n\t"\
		"movslq		0x38(%%r9),%%rcx	\n\t"\
		"movslq		0x3c(%%r9),%%rdx	\n\t"\
		"leaq	(%%r8,%%rax,8),%%rax	\n\t"\
		"leaq	(%%r8,%%rbx,8),%%rbx	\n\t"\
		"leaq	(%%r8,%%rcx,8),%%rcx	\n\t"\
		"leaq	(%%r8,%%rdx,8),%%rdx	\n\t"\
		"addq	$%c[__i4],%%rsi	\n\t"/* __in0 + c*ostride */\
		"leaq	0x10(%%rsi),%%rdi	\n\t"\
		"movaps	%c[__i1](%%rsi),%%xmm4	\n\t"\
		"movaps	%c[__i3](%%rsi),%%xmm6	\n\t"\
		"movaps	%c[__i1](%%rdi),%%xmm5	\n\t"\
		"movaps	%c[__i3](%%rdi),%%xmm7	\n\t"\
		"movaps	%%xmm4,%%xmm0		\n\t"\
		"movaps	%%xmm6,%%xmm2		\n\t"\
		"movaps	%%xmm5,%%xmm1		\n\t"\
		"movaps	%%xmm7,%%xmm3		\n\t"\
		"movq	%[__isrt2],%%rdi	\n\t"\
		"addq	$0x10,%%rdi	\n\t"/* cc0 */\
		"mulpd	0x10(%%rdi),%%xmm4	\n\t"\
		"mulpd	    (%%rdi),%%xmm6	\n\t"\
		"mulpd	    (%%rdi),%%xmm1	\n\t"\
		"mulpd	0x10(%%rdi),%%xmm3	\n\t"\
		"mulpd	0x10(%%rdi),%%xmm5	\n\t"\
		"mulpd	    (%%rdi),%%xmm7	\n\t"\
		"mulpd	    (%%rdi),%%xmm0	\n\t"\
		"mulpd	0x10(%%rdi),%%xmm2	\n\t"\
		"subpd	%%xmm1,%%xmm4		\n\t"\
		"subpd	%%xmm3,%%xmm6		\n\t"\
		"addpd	%%xmm0,%%xmm5		\n\t"\
		"addpd	%%xmm2,%%xmm7		\n\t"\
		"subpd	%%xmm6,%%xmm4		\n\t"\
		"subpd	%%xmm7,%%xmm5		\n\t"\
		"addpd	%%xmm6,%%xmm6		\n\t"\
		"addpd	%%xmm7,%%xmm7		\n\t"\
		"addpd	%%xmm4,%%xmm6		\n\t"\
		"addpd	%%xmm5,%%xmm7		\n\t"\
		"leaq	0x10(%%rsi),%%rdi	\n\t"\
		"movaps	%c[__i2](%%rsi),%%xmm2	\n\t"\
		"movaps	%c[__i2](%%rdi),%%xmm3	\n\t"\
		"addpd	%c[__i2](%%rdi),%%xmm2	\n\t"\
		"subpd	%c[__i2](%%rsi),%%xmm3	\n\t"\
		"movq	%[__isrt2],%%rdi	\n\t"\
		"mulpd	(%%rdi),%%xmm2	\n\t"/* mul by isrt2 */\
		"mulpd	(%%rdi),%%xmm3	\n\t"\
		"leaq	0x10(%%rsi),%%rdi	\n\t"\
		"movaps	        (%%rsi),%%xmm0	\n\t"\
		"movaps	        (%%rdi),%%xmm1	\n\t"\
		"subpd	%%xmm2,%%xmm0		\n\t"\
		"subpd	%%xmm3,%%xmm1		\n\t"\
		"addpd	        (%%rsi),%%xmm2	\n\t"\
		"addpd	        (%%rdi),%%xmm3	\n\t"\
		"subpd	%%xmm4,%%xmm0		\n\t"\
		"subpd	%%xmm5,%%xmm1		\n\t"\
		"addpd	%%xmm4,%%xmm4		\n\t"\
		"addpd	%%xmm5,%%xmm5		\n\t"\
		"movaps	%%xmm0,    (%%rbx)	\n\t"\
		"movaps	%%xmm1,0x10(%%rbx)	\n\t"\
		"addpd	%%xmm0,		%%xmm4	\n\t"\
		"addpd	%%xmm1,		%%xmm5	\n\t"\
		"movaps	%%xmm4,    (%%rax)	\n\t"\
		"movaps	%%xmm5,0x10(%%rax)	\n\t"\
		"subpd	%%xmm7,		%%xmm2	\n\t"\
		"subpd	%%xmm6,		%%xmm3	\n\t"\
		"addpd	%%xmm7,		%%xmm7	\n\t"\
		"addpd	%%xmm6,		%%xmm6	\n\t"\
		"movaps	%%xmm2,    (%%rcx)	\n\t"\
		"movaps	%%xmm3,0x10(%%rdx)	\n\t"\
		"addpd	%%xmm2,		%%xmm7	\n\t"\
		"addpd	%%xmm3,		%%xmm6	\n\t"\
		"movaps	%%xmm7,    (%%rdx)	\n\t"\
		"movaps	%%xmm6,0x10(%%rcx)	\n\t"\
		:					/* outputs: none */\
		:[__in0] "m" (Xin0)	/* All inputs from memory addresses here */\
		,[__i1] "e" (Xi1)\
		,[__i2] "e" (Xi2)\
		,[__i3] "e" (Xi3)\
		,[__i4] "e" (Xi4)\
		,[__isrt2] "m" (Xisrt2)\
		,[__two] "m" (Xtwo)\
		,[out0] "m" (Xout0) /* output-address-16-tet base pointer */\
		,[off] "m" (Xoff)	/* and pointer to uint32 array of 16 double* index offsets */\
		: "cc","memory","rax","rbx","rcx","rdx","rdi","rsi","r8","r9","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7"		/* Clobbered registers */\
	);\
	}

	// Same as above, but with specifiable I-addresses and regularly spaced O-addresses:
	#define SSE2_RADIX16_DIF_0TWIDDLE_B(Xin0,Xi1,Xi2,Xi3,Xi4, Xisrt2,Xtwo, Xout0)\
	{\
	__asm__ volatile (\
		/* SSE2_RADIX4_DIF_IN_PLACE(r1 , r17, r9 , r25): */\
		"movq	%[__in0],%%rax		\n\t"\
		"leaq	%c[__i4](%%rax),%%rcx	\n\t"/* __in0 +   [4*istride]; note BR of [a,b,c,d]-ptrs, i.e. b/c swap */\
		"leaq	%c[__i4](%%rcx),%%rbx	\n\t"/* __in0 + 2*[4*istride] */\
		"leaq	%c[__i4](%%rbx),%%rdx	\n\t"/* __in0 + 3*[4*istride] */\
		"movaps	    (%%rax),%%xmm0	\n\t"\
		"movaps	0x10(%%rax),%%xmm1	\n\t"\
		"movaps	    (%%rax),%%xmm2	\n\t"\
		"movaps	0x10(%%rax),%%xmm3	\n\t"\
		"addpd	    (%%rbx),%%xmm0	\n\t"\
		"addpd	0x10(%%rbx),%%xmm1	\n\t"\
		"subpd	    (%%rbx),%%xmm2	\n\t"\
		"subpd	0x10(%%rbx),%%xmm3	\n\t"\
		"movaps	    (%%rcx),%%xmm4	\n\t"\
		"movaps	0x10(%%rcx),%%xmm5	\n\t"\
		"movaps	    (%%rcx),%%xmm6	\n\t"\
		"movaps	0x10(%%rcx),%%xmm7	\n\t"\
		"addpd	    (%%rdx),%%xmm4	\n\t"\
		"addpd	0x10(%%rdx),%%xmm5	\n\t"\
		"subpd	    (%%rdx),%%xmm6	\n\t"\
		"subpd	0x10(%%rdx),%%xmm7	\n\t"\
		"subpd	%%xmm4,%%xmm0	\n\t"\
		"subpd	%%xmm5,%%xmm1	\n\t"\
		"movaps	%%xmm0,    (%%rbx)	\n\t"\
		"movaps	%%xmm1,0x10(%%rbx)	\n\t"\
		"addpd	%%xmm4,%%xmm4	\n\t"\
		"addpd	%%xmm5,%%xmm5	\n\t"\
		"addpd	%%xmm0,%%xmm4	\n\t"\
		"addpd	%%xmm1,%%xmm5	\n\t"\
		"movaps	%%xmm4,    (%%rax)	\n\t"\
		"movaps	%%xmm5,0x10(%%rax)	\n\t"\
		"subpd	%%xmm7,%%xmm2	\n\t"\
		"subpd	%%xmm6,%%xmm3	\n\t"\
		"movaps	%%xmm2,    (%%rcx)	\n\t"\
		"movaps	%%xmm3,0x10(%%rdx)	\n\t"\
		"addpd	%%xmm7,%%xmm7	\n\t"\
		"addpd	%%xmm6,%%xmm6	\n\t"\
		"addpd	%%xmm2,%%xmm7	\n\t"\
		"addpd	%%xmm3,%%xmm6	\n\t"\
		"movaps	%%xmm7,    (%%rdx)	\n\t"\
		"movaps	%%xmm6,0x10(%%rcx)	\n\t"\
		/* SSE2_RADIX4_DIF_IN_PLACE(r5 , r21, r13, r29): */\
		"addq	$%c[__i2],%%rax	\n\t"/* All addresses += 2*ostride */\
		"addq	$%c[__i2],%%rbx	\n\t"\
		"addq	$%c[__i2],%%rcx	\n\t"\
		"addq	$%c[__i2],%%rdx	\n\t"\
		"movaps	    (%%rax),%%xmm0	\n\t"\
		"movaps	0x10(%%rax),%%xmm1	\n\t"\
		"movaps	    (%%rax),%%xmm2	\n\t"\
		"movaps	0x10(%%rax),%%xmm3	\n\t"\
		"addpd	    (%%rbx),%%xmm0	\n\t"\
		"addpd	0x10(%%rbx),%%xmm1	\n\t"\
		"subpd	    (%%rbx),%%xmm2	\n\t"\
		"subpd	0x10(%%rbx),%%xmm3	\n\t"\
		"movaps	    (%%rcx),%%xmm4	\n\t"\
		"movaps	0x10(%%rcx),%%xmm5	\n\t"\
		"movaps	    (%%rcx),%%xmm6	\n\t"\
		"movaps	0x10(%%rcx),%%xmm7	\n\t"\
		"addpd	    (%%rdx),%%xmm4	\n\t"\
		"addpd	0x10(%%rdx),%%xmm5	\n\t"\
		"subpd	    (%%rdx),%%xmm6	\n\t"\
		"subpd	0x10(%%rdx),%%xmm7	\n\t"\
		"subpd	%%xmm4,%%xmm0	\n\t"\
		"subpd	%%xmm5,%%xmm1	\n\t"\
		"movaps	%%xmm0,    (%%rbx)	\n\t"\
		"movaps	%%xmm1,0x10(%%rbx)	\n\t"\
		"addpd	%%xmm4,%%xmm4	\n\t"\
		"addpd	%%xmm5,%%xmm5	\n\t"\
		"addpd	%%xmm0,%%xmm4	\n\t"\
		"addpd	%%xmm1,%%xmm5	\n\t"\
		"movaps	%%xmm4,    (%%rax)	\n\t"\
		"movaps	%%xmm5,0x10(%%rax)	\n\t"\
		"subpd	%%xmm7,%%xmm2	\n\t"\
		"subpd	%%xmm6,%%xmm3	\n\t"\
		"movaps	%%xmm2,    (%%rcx)	\n\t"\
		"movaps	%%xmm3,0x10(%%rdx)	\n\t"\
		"addpd	%%xmm7,%%xmm7	\n\t"\
		"addpd	%%xmm6,%%xmm6	\n\t"\
		"addpd	%%xmm2,%%xmm7	\n\t"\
		"addpd	%%xmm3,%%xmm6	\n\t"\
		"movaps	%%xmm7,    (%%rdx)	\n\t"\
		"movaps	%%xmm6,0x10(%%rcx)	\n\t"\
		/* SSE2_RADIX4_DIF_IN_PLACE(r3 , r19, r11, r27): */\
		"subq	$%c[__i1],%%rax	\n\t"/* All addresses -= 1*ostride */\
		"subq	$%c[__i1],%%rbx	\n\t"\
		"subq	$%c[__i1],%%rcx	\n\t"\
		"subq	$%c[__i1],%%rdx	\n\t"\
		"movaps	    (%%rax),%%xmm0	\n\t"\
		"movaps	0x10(%%rax),%%xmm1	\n\t"\
		"movaps	    (%%rax),%%xmm2	\n\t"\
		"movaps	0x10(%%rax),%%xmm3	\n\t"\
		"addpd	    (%%rbx),%%xmm0	\n\t"\
		"addpd	0x10(%%rbx),%%xmm1	\n\t"\
		"subpd	    (%%rbx),%%xmm2	\n\t"\
		"subpd	0x10(%%rbx),%%xmm3	\n\t"\
		"movaps	    (%%rcx),%%xmm4	\n\t"\
		"movaps	0x10(%%rcx),%%xmm5	\n\t"\
		"movaps	    (%%rcx),%%xmm6	\n\t"\
		"movaps	0x10(%%rcx),%%xmm7	\n\t"\
		"addpd	    (%%rdx),%%xmm4	\n\t"\
		"addpd	0x10(%%rdx),%%xmm5	\n\t"\
		"subpd	    (%%rdx),%%xmm6	\n\t"\
		"subpd	0x10(%%rdx),%%xmm7	\n\t"\
		"subpd	%%xmm4,%%xmm0	\n\t"\
		"subpd	%%xmm5,%%xmm1	\n\t"\
		"movaps	%%xmm0,    (%%rbx)	\n\t"\
		"movaps	%%xmm1,0x10(%%rbx)	\n\t"\
		"addpd	%%xmm4,%%xmm4	\n\t"\
		"addpd	%%xmm5,%%xmm5	\n\t"\
		"addpd	%%xmm0,%%xmm4	\n\t"\
		"addpd	%%xmm1,%%xmm5	\n\t"\
		"movaps	%%xmm4,    (%%rax)	\n\t"\
		"movaps	%%xmm5,0x10(%%rax)	\n\t"\
		"subpd	%%xmm7,%%xmm2	\n\t"\
		"subpd	%%xmm6,%%xmm3	\n\t"\
		"movaps	%%xmm2,    (%%rcx)	\n\t"\
		"movaps	%%xmm3,0x10(%%rdx)	\n\t"\
		"addpd	%%xmm7,%%xmm7	\n\t"\
		"addpd	%%xmm6,%%xmm6	\n\t"\
		"addpd	%%xmm2,%%xmm7	\n\t"\
		"addpd	%%xmm3,%%xmm6	\n\t"\
		"movaps	%%xmm7,    (%%rdx)	\n\t"\
		"movaps	%%xmm6,0x10(%%rcx)	\n\t"\
		/* SSE2_RADIX4_DIF_IN_PLACE(r7 , r23, r15, r31): */\
		"addq	$%c[__i2],%%rax	\n\t"/* All addresses += 2*ostride */\
		"addq	$%c[__i2],%%rbx	\n\t"\
		"addq	$%c[__i2],%%rcx	\n\t"\
		"addq	$%c[__i2],%%rdx	\n\t"\
		"movaps	    (%%rax),%%xmm0	\n\t"\
		"movaps	0x10(%%rax),%%xmm1	\n\t"\
		"movaps	    (%%rax),%%xmm2	\n\t"\
		"movaps	0x10(%%rax),%%xmm3	\n\t"\
		"addpd	    (%%rbx),%%xmm0	\n\t"\
		"addpd	0x10(%%rbx),%%xmm1	\n\t"\
		"subpd	    (%%rbx),%%xmm2	\n\t"\
		"subpd	0x10(%%rbx),%%xmm3	\n\t"\
		"movaps	    (%%rcx),%%xmm4	\n\t"\
		"movaps	0x10(%%rcx),%%xmm5	\n\t"\
		"movaps	    (%%rcx),%%xmm6	\n\t"\
		"movaps	0x10(%%rcx),%%xmm7	\n\t"\
		"addpd	    (%%rdx),%%xmm4	\n\t"\
		"addpd	0x10(%%rdx),%%xmm5	\n\t"\
		"subpd	    (%%rdx),%%xmm6	\n\t"\
		"subpd	0x10(%%rdx),%%xmm7	\n\t"\
		"subpd	%%xmm4,%%xmm0	\n\t"\
		"subpd	%%xmm5,%%xmm1	\n\t"\
		"movaps	%%xmm0,    (%%rbx)	\n\t"\
		"movaps	%%xmm1,0x10(%%rbx)	\n\t"\
		"addpd	%%xmm4,%%xmm4	\n\t"\
		"addpd	%%xmm5,%%xmm5	\n\t"\
		"addpd	%%xmm0,%%xmm4	\n\t"\
		"addpd	%%xmm1,%%xmm5	\n\t"\
		"movaps	%%xmm4,    (%%rax)	\n\t"\
		"movaps	%%xmm5,0x10(%%rax)	\n\t"\
		"subpd	%%xmm7,%%xmm2	\n\t"\
		"subpd	%%xmm6,%%xmm3	\n\t"\
		"movaps	%%xmm2,    (%%rcx)	\n\t"\
		"movaps	%%xmm3,0x10(%%rdx)	\n\t"\
		"addpd	%%xmm7,%%xmm7	\n\t"\
		"addpd	%%xmm6,%%xmm6	\n\t"\
		"addpd	%%xmm2,%%xmm7	\n\t"\
		"addpd	%%xmm3,%%xmm6	\n\t"\
		"movaps	%%xmm7,    (%%rdx)	\n\t"\
		"movaps	%%xmm6,0x10(%%rcx)	\n\t"\
	/*** Now do 4 DFTs with internal twiddles on the 1*stride - separated data. Do blocks in order 0,2,1,3 to allow increment-only of rsi-datum from 1 block to the next: ***/\
		"movq	%[__isrt2],%%rdi	\n\t"\
		"movaps	(%%rdi),%%xmm10	\n\t"/* isrt2 */\
		/* Block 0: r0-3 */\
		"movq	%[__out0],%%rsi	\n\t"\
		"movq	%[__in0],%%rax		\n\t"\
		"leaq	%c[__i1](%%rax),%%rcx	\n\t"/* __in0 +   istride; note BR of [a,b,c,d]-ptrs, i.e. b/c swap */\
		"leaq	%c[__i1](%%rcx),%%rbx	\n\t"/* __in0 + 2*istride */\
		"leaq	%c[__i1](%%rbx),%%rdx	\n\t"/* __in0 + 3*istride */\
		"movaps	    (%%rax),%%xmm0	\n\t"\
		"movaps	0x10(%%rax),%%xmm1	\n\t"\
		"movaps		(%%rbx),%%xmm2	\n\t"\
		"movaps	0x10(%%rbx),%%xmm3	\n\t"\
		"subpd		(%%rbx),%%xmm0	\n\t"\
		"subpd	0x10(%%rbx),%%xmm1	\n\t"\
		"addpd	    (%%rax),%%xmm2	\n\t"\
		"addpd	0x10(%%rax),%%xmm3	\n\t"\
		"movaps		(%%rcx),%%xmm4	\n\t"\
		"movaps	0x10(%%rcx),%%xmm5	\n\t"\
		"movaps	    (%%rdx),%%xmm6	\n\t"\
		"movaps	0x10(%%rdx),%%xmm7	\n\t"\
		"subpd		(%%rdx),%%xmm4	\n\t"\
		"subpd	0x10(%%rdx),%%xmm5	\n\t"\
		"addpd	    (%%rcx),%%xmm6	\n\t"\
		"addpd	0x10(%%rcx),%%xmm7	\n\t"\
		"subpd	%%xmm6,%%xmm2	\n\t"\
		"subpd	%%xmm7,%%xmm3	\n\t"\
		"addpd	%%xmm6,%%xmm6	\n\t"\
		"addpd	%%xmm7,%%xmm7	\n\t"\
		"movaps	%%xmm2,0x20(%%rsi)	\n\t"\
		"movaps	%%xmm3,0x30(%%rsi)	\n\t"\
		"addpd	%%xmm2,%%xmm6	\n\t"\
		"addpd	%%xmm3,%%xmm7	\n\t"\
		"movaps	%%xmm6,    (%%rsi)	\n\t"\
		"movaps	%%xmm7,0x10(%%rsi)	\n\t"\
		"subpd	%%xmm5,%%xmm0	\n\t"\
		"subpd	%%xmm4,%%xmm1	\n\t"\
		"addpd	%%xmm5,%%xmm5	\n\t"\
		"addpd	%%xmm4,%%xmm4	\n\t"\
		"movaps	%%xmm0,0x40(%%rsi)	\n\t"\
		"movaps	%%xmm1,0x70(%%rsi)	\n\t"\
		"addpd	%%xmm0,%%xmm5	\n\t"\
		"addpd	%%xmm1,%%xmm4	\n\t"\
		"movaps	%%xmm5,0x60(%%rsi)	\n\t"\
		"movaps	%%xmm4,0x50(%%rsi)	\n\t"\
		/* Block 2: */\
		"addq	$0x100,%%rsi	\n\t"\
		"addq	$%c[__i4],%%rax	\n\t"/* All addresses += 4*ostride */\
		"addq	$%c[__i4],%%rbx	\n\t"\
		"addq	$%c[__i4],%%rcx	\n\t"\
		"addq	$%c[__i4],%%rdx	\n\t"\
		"movaps	    (%%rcx),%%xmm4	\n\t"\
		"movaps		(%%rdx),%%xmm6	\n\t"\
		"movaps	0x10(%%rcx),%%xmm5	\n\t"\
		"movaps	0x10(%%rdx),%%xmm7	\n\t"\
		"movaps	%%xmm4,%%xmm0		\n\t"\
		"movaps	%%xmm6,%%xmm2		\n\t"\
		"movaps	%%xmm5,%%xmm1		\n\t"\
		"movaps	%%xmm7,%%xmm3		\n\t"\
		"movaps	0x10(%%rdi),%%xmm8	\n\t"/* cc0 */\
		"movaps	0x20(%%rdi),%%xmm9	\n\t"/* ss0 */\
		"mulpd	%%xmm8,%%xmm4	\n\t"\
		"mulpd	%%xmm9,%%xmm6	\n\t"\
		"mulpd	%%xmm9,%%xmm1	\n\t"\
		"mulpd	%%xmm8,%%xmm3	\n\t"\
		"mulpd	%%xmm8,%%xmm5	\n\t"\
		"mulpd	%%xmm9,%%xmm7	\n\t"\
		"mulpd	%%xmm9,%%xmm0	\n\t"\
		"mulpd	%%xmm8,%%xmm2	\n\t"\
		"subpd	%%xmm1,%%xmm4	\n\t"\
		"subpd	%%xmm3,%%xmm6	\n\t"\
		"addpd	%%xmm0,%%xmm5	\n\t"\
		"addpd	%%xmm2,%%xmm7	\n\t"\
		"subpd	%%xmm6,%%xmm4	\n\t"\
		"subpd	%%xmm7,%%xmm5	\n\t"\
		"addpd	%%xmm6,%%xmm6	\n\t"\
		"addpd	%%xmm7,%%xmm7	\n\t"\
		"addpd	%%xmm4,%%xmm6	\n\t"\
		"addpd	%%xmm5,%%xmm7	\n\t"\
		"movaps	    (%%rbx),%%xmm2	\n\t"\
		"movaps	0x10(%%rbx),%%xmm3	\n\t"\
		"subpd	0x10(%%rbx),%%xmm2	\n\t"\
		"addpd		(%%rbx),%%xmm3	\n\t"\
		"mulpd	%%xmm10,%%xmm2	\n\t"/* mul by isrt2 */\
		"mulpd	%%xmm10,%%xmm3	\n\t"\
		"movaps	    (%%rax),%%xmm0	\n\t"\
		"movaps	0x10(%%rax),%%xmm1	\n\t"\
		"subpd	%%xmm2,%%xmm0	\n\t"\
		"subpd	%%xmm3,%%xmm1	\n\t"\
		"addpd	    (%%rax),%%xmm2	\n\t"\
		"addpd	0x10(%%rax),%%xmm3	\n\t"\
		"subpd	%%xmm6,%%xmm2	\n\t"\
		"subpd	%%xmm7,%%xmm3	\n\t"\
		"addpd	%%xmm6,%%xmm6	\n\t"\
		"addpd	%%xmm7,%%xmm7	\n\t"\
		"movaps	%%xmm2,0x20(%%rsi)	\n\t"\
		"movaps	%%xmm3,0x30(%%rsi)	\n\t"\
		"addpd	%%xmm2,%%xmm6	\n\t"\
		"addpd	%%xmm3,%%xmm7	\n\t"\
		"movaps	%%xmm6,    (%%rsi)	\n\t"\
		"movaps	%%xmm7,0x10(%%rsi)	\n\t"\
		"subpd	%%xmm5,%%xmm0	\n\t"\
		"subpd	%%xmm4,%%xmm1	\n\t"\
		"addpd	%%xmm5,%%xmm5	\n\t"\
		"addpd	%%xmm4,%%xmm4	\n\t"\
		"movaps	%%xmm0,0x40(%%rsi)	\n\t"\
		"movaps	%%xmm1,0x70(%%rsi)	\n\t"\
		"addpd	%%xmm0,%%xmm5	\n\t"\
		"addpd	%%xmm1,%%xmm4	\n\t"\
		"movaps	%%xmm5,0x60(%%rsi)	\n\t"\
		"movaps	%%xmm4,0x50(%%rsi)	\n\t"\
		/* Block 1: r8-b */\
		"subq	$0x80,%%rsi	\n\t"\
		"addq	$%c[__i4],%%rax	\n\t"/* All addresses += 4*ostride */\
		"addq	$%c[__i4],%%rbx	\n\t"\
		"addq	$%c[__i4],%%rcx	\n\t"\
		"addq	$%c[__i4],%%rdx	\n\t"\
		"movaps	    (%%rax),%%xmm0	\n\t"\
		"movaps	0x10(%%rax),%%xmm1	\n\t"\
		"movaps		(%%rbx),%%xmm2	\n\t"\
		"movaps	0x10(%%rbx),%%xmm3	\n\t"\
		"subpd	0x10(%%rbx),%%xmm0	\n\t"\
		"subpd		(%%rbx),%%xmm1	\n\t"\
		"addpd	0x10(%%rax),%%xmm2	\n\t"\
		"addpd		(%%rax),%%xmm3	\n\t"\
		"movaps		(%%rcx),%%xmm4	\n\t"\
		"movaps	0x10(%%rcx),%%xmm5	\n\t"\
		"movaps	    (%%rdx),%%xmm6	\n\t"\
		"movaps	0x10(%%rdx),%%xmm7	\n\t"\
		"subpd	0x10(%%rcx),%%xmm4	\n\t"\
		"addpd		(%%rcx),%%xmm5	\n\t"\
		"addpd	0x10(%%rdx),%%xmm6	\n\t"\
		"subpd		(%%rdx),%%xmm7	\n\t"\
		"mulpd	%%xmm10,%%xmm4	\n\t"\
		"mulpd	%%xmm10,%%xmm5	\n\t"\
		"mulpd	%%xmm10,%%xmm6	\n\t"\
		"mulpd	%%xmm10,%%xmm7	\n\t"\
		"subpd	%%xmm6,%%xmm4	\n\t"\
		"subpd	%%xmm7,%%xmm5	\n\t"\
		"addpd	%%xmm6,%%xmm6	\n\t"\
		"addpd	%%xmm7,%%xmm7	\n\t"\
		"addpd	%%xmm4,%%xmm6	\n\t"\
		"addpd	%%xmm5,%%xmm7	\n\t"\
		"subpd	%%xmm4,%%xmm0	\n\t"\
		"subpd	%%xmm5,%%xmm2	\n\t"\
		"addpd	%%xmm4,%%xmm4	\n\t"\
		"addpd	%%xmm5,%%xmm5	\n\t"\
		"movaps	%%xmm0,0x20(%%rsi)	\n\t"\
		"movaps	%%xmm2,0x30(%%rsi)	\n\t"\
		"addpd	%%xmm0,%%xmm4	\n\t"\
		"addpd	%%xmm2,%%xmm5	\n\t"\
		"movaps	%%xmm4,    (%%rsi)	\n\t"\
		"movaps	%%xmm5,0x10(%%rsi)	\n\t"\
		"subpd	%%xmm7,%%xmm3	\n\t"\
		"subpd	%%xmm6,%%xmm1	\n\t"\
		"addpd	%%xmm7,%%xmm7	\n\t"\
		"addpd	%%xmm6,%%xmm6	\n\t"\
		"movaps	%%xmm3,0x40(%%rsi)	\n\t"\
		"movaps	%%xmm1,0x70(%%rsi)	\n\t"\
		"addpd	%%xmm3,%%xmm7	\n\t"\
		"addpd	%%xmm1,%%xmm6	\n\t"\
		"movaps	%%xmm7,0x60(%%rsi)	\n\t"\
		"movaps	%%xmm6,0x50(%%rsi)	\n\t"\
		/* Block 3: */\
		"addq	$0x100,%%rsi	\n\t"\
		"addq	$%c[__i4],%%rax	\n\t"/* All addresses += 4*ostride */\
		"addq	$%c[__i4],%%rbx	\n\t"\
		"addq	$%c[__i4],%%rcx	\n\t"\
		"addq	$%c[__i4],%%rdx	\n\t"\
		"movaps	    (%%rcx),%%xmm4	\n\t"\
		"movaps		(%%rdx),%%xmm6	\n\t"\
		"movaps	0x10(%%rcx),%%xmm5	\n\t"\
		"movaps	0x10(%%rdx),%%xmm7	\n\t"\
		"movaps	%%xmm4,%%xmm0		\n\t"\
		"movaps	%%xmm6,%%xmm2		\n\t"\
		"movaps	%%xmm5,%%xmm1		\n\t"\
		"movaps	%%xmm7,%%xmm3		\n\t"\
		"mulpd	%%xmm9,%%xmm4	\n\t"\
		"mulpd	%%xmm8,%%xmm6	\n\t"\
		"mulpd	%%xmm8,%%xmm1	\n\t"\
		"mulpd	%%xmm9,%%xmm3	\n\t"\
		"mulpd	%%xmm9,%%xmm5	\n\t"\
		"mulpd	%%xmm8,%%xmm7	\n\t"\
		"mulpd	%%xmm8,%%xmm0	\n\t"\
		"mulpd	%%xmm9,%%xmm2	\n\t"\
		"subpd	%%xmm1,%%xmm4	\n\t"\
		"subpd	%%xmm3,%%xmm6	\n\t"\
		"addpd	%%xmm0,%%xmm5	\n\t"\
		"addpd	%%xmm2,%%xmm7	\n\t"\
		"subpd	%%xmm6,%%xmm4	\n\t"\
		"subpd	%%xmm7,%%xmm5	\n\t"\
		"addpd	%%xmm6,%%xmm6	\n\t"\
		"addpd	%%xmm7,%%xmm7	\n\t"\
		"addpd	%%xmm4,%%xmm6	\n\t"\
		"addpd	%%xmm5,%%xmm7	\n\t"\
		"movaps	    (%%rbx),%%xmm2	\n\t"\
		"movaps	0x10(%%rbx),%%xmm3	\n\t"\
		"addpd	0x10(%%rbx),%%xmm2	\n\t"\
		"subpd	    (%%rbx),%%xmm3	\n\t"\
		"mulpd	%%xmm10,%%xmm2	\n\t"/* mul by isrt2 */\
		"mulpd	%%xmm10,%%xmm3	\n\t"\
		"movaps	    (%%rax),%%xmm0	\n\t"\
		"movaps	0x10(%%rax),%%xmm1	\n\t"\
		"subpd	%%xmm2,%%xmm0	\n\t"\
		"subpd	%%xmm3,%%xmm1	\n\t"\
		"addpd	    (%%rax),%%xmm2	\n\t"\
		"addpd	0x10(%%rax),%%xmm3	\n\t"\
		"subpd	%%xmm4,%%xmm0	\n\t"\
		"subpd	%%xmm5,%%xmm1	\n\t"\
		"addpd	%%xmm4,%%xmm4	\n\t"\
		"addpd	%%xmm5,%%xmm5	\n\t"\
		"movaps	%%xmm0,0x20(%%rsi)	\n\t"\
		"movaps	%%xmm1,0x30(%%rsi)	\n\t"\
		"addpd	%%xmm0,%%xmm4	\n\t"\
		"addpd	%%xmm1,%%xmm5	\n\t"\
		"movaps	%%xmm4,    (%%rsi)	\n\t"\
		"movaps	%%xmm5,0x10(%%rsi)	\n\t"\
		"subpd	%%xmm7,%%xmm2	\n\t"\
		"subpd	%%xmm6,%%xmm3	\n\t"\
		"addpd	%%xmm7,%%xmm7	\n\t"\
		"addpd	%%xmm6,%%xmm6	\n\t"\
		"movaps	%%xmm2,0x40(%%rsi)	\n\t"\
		"movaps	%%xmm3,0x70(%%rsi)	\n\t"\
		"addpd	%%xmm2,%%xmm7	\n\t"\
		"addpd	%%xmm3,%%xmm6	\n\t"\
		"movaps	%%xmm7,0x60(%%rsi)	\n\t"\
		"movaps	%%xmm6,0x50(%%rsi)	\n\t"\
		:					/* outputs: none */\
		:[__in0] "m" (Xin0)	/* All inputs from memory addresses here */\
		,[__i1] "e" (Xi1)\
		,[__i2] "e" (Xi2)\
		,[__i3] "e" (Xi3)\
		,[__i4] "e" (Xi4)\
		,[__isrt2] "m" (Xisrt2)\
		,[__two] "m" (Xtwo)\
		,[__out0] "m" (Xout0)\
		: "cc","memory","rax","rbx","rcx","rdx","rdi","rsi","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10"		/* Clobbered registers */\
	);\
	}

	/* With-twiddles out-of-place analog of above twiddleless DIT macro: 15 nontrivial complex input twiddles E1-f [E0 assumed = 1],
	The DIT version of this macro processes the twiddles in-order.
	NOTE: SINCE THIS MACRO IS SPECIFICALLY DESIGNED AS THE 2ND-PASS OF LARGE-POWER-OF-2-TWIDDLELESS DFT SYNTHESIS, THE
	"TWIDDLES" HERE ARE PURELY OF THE DFT-INTERNAL VARIETY, AND THUS APPLIED TO THE INPUTS, JUST AS FOR THE ABOVE DIF COUNTERPART.

	Sincos layout: Two portions:

	Radix-16 shared consts anchored at isrt2:

	  isrt2 + 0x000;	cc0 + 0x010;	ss0 + 0x020;

	Per-block-specific set of 15 complex twiddles anchored at c1:

		c1  + 0x000;	s1  + 0x010;
		c2  + 0x020;	s2  + 0x030;
		c3  + 0x040;	s3  + 0x050;
		c4  + 0x060;	s4  + 0x070;
		c5  + 0x080;	s5  + 0x090;
		c6  + 0x0a0;	s6  + 0x0b0;
		c7  + 0x0c0;	s7  + 0x0d0;
		c8  + 0x0e0;	s8  + 0x0f0;
		c9  + 0x100;	s9  + 0x110;
		c10 + 0x120;	s10 + 0x130;
		c11 + 0x140;	s11 + 0x150;
		c12 + 0x160;	s12 + 0x170;
		c13 + 0x180;	s13 + 0x190;
		c14 + 0x1a0;	s14 + 0x1b0;
		c15 + 0x1c0;	s15 + 0x1d0;

	Use radix-16 DIF as template for DIT/OOP here, since need a pre-twiddles algorithm:
	*/
	#define SSE2_RADIX16_DIT_TWIDDLE_OOP(Xin0,Xi1,Xi2,Xi3,Xi4, Xout0,Xo1,Xo2,Xo3,Xo4, Xisrt2,Xc1)\
	{\
	__asm__ volatile (\
	/*...Block 0: Do in-place, i.e. outputs into __in0 + [0,1,2,3]*istride: */\
		"movq	%[__in0],%%rax		\n\t"\
		"leaq	%c[__i1](%%rax),%%rcx	\n\t"/* __in0 +   istride */\
		"leaq	%c[__i2](%%rax),%%rbx	\n\t"/* __in0 + 2*istride */\
		"leaq	%c[__i3](%%rax),%%rdx	\n\t"/* __in0 + 3*istride */\
		"/* Do	the p0,1 combo: */	\n\t"\
		"movq	%[__c1],%%rsi 	/* c1 */\n\t"\
		"movaps	    (%%rax),%%xmm0	\n\t"\
		"movaps	    (%%rcx),%%xmm4	\n\t"\
		"movaps	0x10(%%rax),%%xmm1	\n\t"\
		"movaps	0x10(%%rcx),%%xmm5	\n\t"\
		"movaps	%%xmm4,%%xmm6		\n\t"\
		"movaps	%%xmm5,%%xmm7		\n\t"\
		"mulpd	    (%%rsi),%%xmm4	\n\t"\
		"mulpd	    (%%rsi),%%xmm5	\n\t"\
		"mulpd	0x10(%%rsi),%%xmm6	\n\t"\
		"movaps	%%xmm0,%%xmm2		\n\t"\
		"mulpd	0x10(%%rsi),%%xmm7	\n\t"\
		"subpd	%%xmm6,%%xmm5		\n\t"\
		"movaps	%%xmm1,%%xmm3		\n\t"\
		"addpd	%%xmm7,%%xmm4		\n\t"\
		"addpd	%%xmm4,%%xmm0		\n\t"\
		"addpd	%%xmm5,%%xmm1		\n\t"\
		"subpd	%%xmm4,%%xmm2		\n\t"\
		"subpd	%%xmm5,%%xmm3		\n\t"\
		"/* Do	the p2,3 combo: */	\n\t"\
		"addq	$0x20,%%rsi 	/* c2,3 */\n\t"\
		"movaps	    (%%rdx),%%xmm4	\n\t"\
		"movaps	0x10(%%rdx),%%xmm5	\n\t"\
		"movaps	    (%%rdx),%%xmm6	\n\t"\
		"movaps	0x10(%%rdx),%%xmm7	\n\t"\
		"mulpd	0x20(%%rsi),%%xmm4	/* c3 */\n\t"\
		"mulpd	0x20(%%rsi),%%xmm5	\n\t"\
		"mulpd	0x30(%%rsi),%%xmm6	\n\t"\
		"mulpd	0x30(%%rsi),%%xmm7	\n\t"\
		"subpd	%%xmm6,%%xmm5		\n\t"\
		"addpd	%%xmm7,%%xmm4		\n\t"\
		"movaps	%%xmm5,0x10(%%rax)	\n\t"\
		"movaps	%%xmm4,    (%%rax)	\n\t"\
		"movaps	    (%%rbx),%%xmm4	\n\t"\
		"movaps	0x10(%%rbx),%%xmm5	\n\t"\
		"movaps	    (%%rbx),%%xmm6	\n\t"\
		"movaps	0x10(%%rbx),%%xmm7	\n\t"\
		"mulpd	    (%%rsi),%%xmm4	/* c2 */\n\t"\
		"mulpd	    (%%rsi),%%xmm5	\n\t"\
		"mulpd	0x10(%%rsi),%%xmm6	\n\t"\
		"mulpd	0x10(%%rsi),%%xmm7	\n\t"\
		"subpd	%%xmm6,%%xmm5		\n\t"\
		"addpd	%%xmm7,%%xmm4		\n\t"\
		"movaps	%%xmm5,%%xmm7		\n\t"\
		"movaps	%%xmm4,%%xmm6		\n\t"\
		"subpd	    (%%rax),%%xmm4	\n\t"\
		"subpd	0x10(%%rax),%%xmm5	\n\t"\
		"addpd	    (%%rax),%%xmm6	\n\t"\
		"addpd	0x10(%%rax),%%xmm7	\n\t"\
		"/* Finish radix-4 butterfly and store results into temporary-array slots: */\n\t"\
		"subpd	%%xmm6,%%xmm0		\n\t"\
		"subpd	%%xmm5,%%xmm2		\n\t"\
		"subpd	%%xmm7,%%xmm1		\n\t"\
		"subpd	%%xmm4,%%xmm3		\n\t"\
		/* DIT has outputs (indexed in real-temp form as 0-7) 2/6,3/7 swapped, i.e. swap oregs c/d vs DIF: */\
		"movaps	%%xmm0,    (%%rbx)	\n\t"\
		"movaps	%%xmm2,    (%%rdx)	\n\t"\
		"movaps	%%xmm1,0x10(%%rbx)	\n\t"\
		"movaps	%%xmm3,0x10(%%rcx)	\n\t"\
		"addpd	%%xmm6,%%xmm6		\n\t"\
		"addpd	%%xmm5,%%xmm5		\n\t"\
		"addpd	%%xmm7,%%xmm7		\n\t"\
		"addpd	%%xmm4,%%xmm4		\n\t"\
		"addpd	%%xmm0,%%xmm6		\n\t"\
		"addpd	%%xmm2,%%xmm5		\n\t"\
		"addpd	%%xmm1,%%xmm7		\n\t"\
		"addpd	%%xmm3,%%xmm4		\n\t"\
		"movaps	%%xmm6,    (%%rax)	\n\t"\
		"movaps	%%xmm5,    (%%rcx)	\n\t"\
		"movaps	%%xmm7,0x10(%%rax)	\n\t"\
		"movaps	%%xmm4,0x10(%%rdx)	\n\t"\
		"\n\t"\
	/*...Block 1: outputs into __in0 + [4,5,6,7]*istride: */\
		"addq	$%c[__i4],%%rax	\n\t"/* __in0 + 4*istride */\
		"addq	$%c[__i4],%%rcx	\n\t"/* __in0 + 5*istride */\
		"addq	$%c[__i4],%%rbx	\n\t"/* __in0 + 6*istride */\
		"addq	$%c[__i4],%%rdx	\n\t"/* __in0 + 7*istride */\
		"/* Do	the p0,1 combo: */	\n\t"\
		"addq	$0x40,%%rsi 	/* c4,5 */\n\t"\
		"movaps	    (%%rax),%%xmm0	\n\t"\
		"movaps	    (%%rcx),%%xmm4	\n\t"\
		"movaps	0x10(%%rax),%%xmm1	\n\t"\
		"movaps	0x10(%%rcx),%%xmm5	\n\t"\
		"movaps	    (%%rsi),%%xmm6	\n\t"\
		"movaps	0x10(%%rsi),%%xmm7	\n\t"\
		"movaps	%%xmm0,%%xmm2		\n\t"\
		"movaps	%%xmm1,%%xmm3		\n\t"\
		"mulpd	%%xmm6,%%xmm0		/* c4 */\n\t"\
		"mulpd	%%xmm6,%%xmm1		\n\t"\
		"mulpd	%%xmm7,%%xmm2		\n\t"\
		"mulpd	%%xmm7,%%xmm3		\n\t"\
		"movaps	%%xmm4,%%xmm6		\n\t"\
		"subpd	%%xmm2,%%xmm1		\n\t"\
		"movaps	%%xmm5,%%xmm7		\n\t"\
		"mulpd	0x20(%%rsi),%%xmm4	/* c5 */\n\t"\
		"addpd	%%xmm3,%%xmm0		\n\t"\
		"mulpd	0x20(%%rsi),%%xmm5	\n\t"\
		"mulpd	0x30(%%rsi),%%xmm6	\n\t"\
		"movaps	%%xmm0,%%xmm2		\n\t"\
		"mulpd	0x30(%%rsi),%%xmm7	\n\t"\
		"subpd	%%xmm6,%%xmm5		\n\t"\
		"movaps	%%xmm1,%%xmm3		\n\t"\
		"addpd	%%xmm7,%%xmm4		\n\t"\
		"addpd	%%xmm4,%%xmm0		\n\t"\
		"addpd	%%xmm5,%%xmm1		\n\t"\
		"subpd	%%xmm4,%%xmm2		\n\t"\
		"subpd	%%xmm5,%%xmm3		\n\t"\
		"/* Do	the p2,3 combo: */	\n\t"\
		"addq	$0x40,%%rsi 	/* c6,7 */\n\t"\
		"movaps	    (%%rdx),%%xmm4	\n\t"\
		"movaps	0x10(%%rdx),%%xmm5	\n\t"\
		"movaps	    (%%rdx),%%xmm6	\n\t"\
		"movaps	0x10(%%rdx),%%xmm7	\n\t"\
		"mulpd	0x20(%%rsi),%%xmm4	/* c7 */\n\t"\
		"mulpd	0x20(%%rsi),%%xmm5	\n\t"\
		"mulpd	0x30(%%rsi),%%xmm6	\n\t"\
		"mulpd	0x30(%%rsi),%%xmm7	\n\t"\
		"subpd	%%xmm6,%%xmm5		\n\t"\
		"addpd	%%xmm7,%%xmm4		\n\t"\
		"movaps	%%xmm5,0x10(%%rax)	\n\t"\
		"movaps	%%xmm4,    (%%rax)	\n\t"\
		"movaps	    (%%rbx),%%xmm4	\n\t"\
		"movaps	0x10(%%rbx),%%xmm5	\n\t"\
		"movaps	    (%%rbx),%%xmm6	\n\t"\
		"movaps	0x10(%%rbx),%%xmm7	\n\t"\
		"mulpd	    (%%rsi),%%xmm4	/* c6 */\n\t"\
		"mulpd	    (%%rsi),%%xmm5	\n\t"\
		"mulpd	0x10(%%rsi),%%xmm6	\n\t"\
		"mulpd	0x10(%%rsi),%%xmm7	\n\t"\
		"subpd	%%xmm6,%%xmm5		\n\t"\
		"addpd	%%xmm7,%%xmm4		\n\t"\
		"movaps	%%xmm5,%%xmm7		\n\t"\
		"movaps	%%xmm4,%%xmm6		\n\t"\
		"subpd	    (%%rax),%%xmm4	\n\t"\
		"subpd	0x10(%%rax),%%xmm5	\n\t"\
		"addpd	    (%%rax),%%xmm6	\n\t"\
		"addpd	0x10(%%rax),%%xmm7	\n\t"\
		"/* Finish radix-4 butterfly and store results into temporary-array slots: */\n\t"\
		"subpd	%%xmm6,%%xmm0		\n\t"\
		"subpd	%%xmm5,%%xmm2		\n\t"\
		"subpd	%%xmm7,%%xmm1		\n\t"\
		"subpd	%%xmm4,%%xmm3		\n\t"\
		"movaps	%%xmm0,    (%%rbx)	\n\t"\
		"movaps	%%xmm2,    (%%rdx)	\n\t"\
		"movaps	%%xmm1,0x10(%%rbx)	\n\t"\
		"movaps	%%xmm3,0x10(%%rcx)	\n\t"\
		"addpd	%%xmm6,%%xmm6		\n\t"\
		"addpd	%%xmm5,%%xmm5		\n\t"\
		"addpd	%%xmm7,%%xmm7		\n\t"\
		"addpd	%%xmm4,%%xmm4		\n\t"\
		"addpd	%%xmm0,%%xmm6		\n\t"\
		"addpd	%%xmm2,%%xmm5		\n\t"\
		"addpd	%%xmm1,%%xmm7		\n\t"\
		"addpd	%%xmm3,%%xmm4		\n\t"\
		"movaps	%%xmm6,    (%%rax)	\n\t"\
		"movaps	%%xmm5,    (%%rcx)	\n\t"\
		"movaps	%%xmm7,0x10(%%rax)	\n\t"\
		"movaps	%%xmm4,0x10(%%rdx)	\n\t"\
		"\n\t"\
	/*...Block 2: outputs into __in0 + [8,9,a,b]*istride: */\
		"addq	$%c[__i4],%%rax	\n\t"/* __in0 + 8*istride */\
		"addq	$%c[__i4],%%rcx	\n\t"/* __in0 + 9*istride */\
		"addq	$%c[__i4],%%rbx	\n\t"/* __in0 + a*istride */\
		"addq	$%c[__i4],%%rdx	\n\t"/* __in0 + b*istride */\
		"/* Do	the p0,1 combo: */	\n\t"\
		"addq	$0x40,%%rsi 	/* c8,9 */\n\t"\
		"movaps	    (%%rax),%%xmm0	\n\t"\
		"movaps	    (%%rcx),%%xmm4	\n\t"\
		"movaps	0x10(%%rax),%%xmm1	\n\t"\
		"movaps	0x10(%%rcx),%%xmm5	\n\t"\
		"movaps	    (%%rsi),%%xmm6	\n\t"\
		"movaps	0x10(%%rsi),%%xmm7	\n\t"\
		"movaps	%%xmm0,%%xmm2		\n\t"\
		"movaps	%%xmm1,%%xmm3		\n\t"\
		"mulpd	%%xmm6,%%xmm0		/* c8 */\n\t"\
		"mulpd	%%xmm6,%%xmm1		\n\t"\
		"mulpd	%%xmm7,%%xmm2		\n\t"\
		"mulpd	%%xmm7,%%xmm3		\n\t"\
		"movaps	%%xmm4,%%xmm6		\n\t"\
		"subpd	%%xmm2,%%xmm1		\n\t"\
		"movaps	%%xmm5,%%xmm7		\n\t"\
		"mulpd	0x20(%%rsi),%%xmm4	/* c9 */\n\t"\
		"addpd	%%xmm3,%%xmm0		\n\t"\
		"mulpd	0x20(%%rsi),%%xmm5	\n\t"\
		"mulpd	0x30(%%rsi),%%xmm6	\n\t"\
		"movaps	%%xmm0,%%xmm2		\n\t"\
		"mulpd	0x30(%%rsi),%%xmm7	\n\t"\
		"subpd	%%xmm6,%%xmm5		\n\t"\
		"movaps	%%xmm1,%%xmm3		\n\t"\
		"addpd	%%xmm7,%%xmm4		\n\t"\
		"addpd	%%xmm4,%%xmm0		\n\t"\
		"addpd	%%xmm5,%%xmm1		\n\t"\
		"subpd	%%xmm4,%%xmm2		\n\t"\
		"subpd	%%xmm5,%%xmm3		\n\t"\
		"/* Do	the p2,3 combo: */	\n\t"\
		"addq	$0x40,%%rsi 	/* ca,b */\n\t"\
		"movaps	    (%%rdx),%%xmm4	\n\t"\
		"movaps	0x10(%%rdx),%%xmm5	\n\t"\
		"movaps	    (%%rdx),%%xmm6	\n\t"\
		"movaps	0x10(%%rdx),%%xmm7	\n\t"\
		"mulpd	0x20(%%rsi),%%xmm4	/* cb */\n\t"\
		"mulpd	0x20(%%rsi),%%xmm5	\n\t"\
		"mulpd	0x30(%%rsi),%%xmm6	\n\t"\
		"mulpd	0x30(%%rsi),%%xmm7	\n\t"\
		"subpd	%%xmm6,%%xmm5		\n\t"\
		"addpd	%%xmm7,%%xmm4		\n\t"\
		"movaps	%%xmm5,0x10(%%rax)	\n\t"\
		"movaps	%%xmm4,    (%%rax)	\n\t"\
		"movaps	    (%%rbx),%%xmm4	\n\t"\
		"movaps	0x10(%%rbx),%%xmm5	\n\t"\
		"movaps	    (%%rbx),%%xmm6	\n\t"\
		"movaps	0x10(%%rbx),%%xmm7	\n\t"\
		"mulpd	    (%%rsi),%%xmm4	/* ca */\n\t"\
		"mulpd	    (%%rsi),%%xmm5	\n\t"\
		"mulpd	0x10(%%rsi),%%xmm6	\n\t"\
		"mulpd	0x10(%%rsi),%%xmm7	\n\t"\
		"subpd	%%xmm6,%%xmm5		\n\t"\
		"addpd	%%xmm7,%%xmm4		\n\t"\
		"movaps	%%xmm5,%%xmm7		\n\t"\
		"movaps	%%xmm4,%%xmm6		\n\t"\
		"subpd	    (%%rax),%%xmm4	\n\t"\
		"subpd	0x10(%%rax),%%xmm5	\n\t"\
		"addpd	    (%%rax),%%xmm6	\n\t"\
		"addpd	0x10(%%rax),%%xmm7	\n\t"\
		"/* Finish radix-4 butterfly and store results into temporary-array slots: */\n\t"\
		"subpd	%%xmm6,%%xmm0		\n\t"\
		"subpd	%%xmm5,%%xmm2		\n\t"\
		"subpd	%%xmm7,%%xmm1		\n\t"\
		"subpd	%%xmm4,%%xmm3		\n\t"\
		"movaps	%%xmm0,    (%%rbx)	\n\t"\
		"movaps	%%xmm2,    (%%rdx)	\n\t"\
		"movaps	%%xmm1,0x10(%%rbx)	\n\t"\
		"movaps	%%xmm3,0x10(%%rcx)	\n\t"\
		"addpd	%%xmm6,%%xmm6		\n\t"\
		"addpd	%%xmm5,%%xmm5		\n\t"\
		"addpd	%%xmm7,%%xmm7		\n\t"\
		"addpd	%%xmm4,%%xmm4		\n\t"\
		"addpd	%%xmm0,%%xmm6		\n\t"\
		"addpd	%%xmm2,%%xmm5		\n\t"\
		"addpd	%%xmm1,%%xmm7		\n\t"\
		"addpd	%%xmm3,%%xmm4		\n\t"\
		"movaps	%%xmm6,    (%%rax)	\n\t"\
		"movaps	%%xmm5,    (%%rcx)	\n\t"\
		"movaps	%%xmm7,0x10(%%rax)	\n\t"\
		"movaps	%%xmm4,0x10(%%rdx)	\n\t"\
		"\n\t"\
	/*...Block 3: outputs into __in0 + [c,d,e,f]*istride: */\
		"addq	$%c[__i4],%%rax	\n\t"/* __in0 + c*istride */\
		"addq	$%c[__i4],%%rcx	\n\t"/* __in0 + d*istride */\
		"addq	$%c[__i4],%%rbx	\n\t"/* __in0 + e*istride */\
		"addq	$%c[__i4],%%rdx	\n\t"/* __in0 + f*istride */\
		"/* Do	the p0,1 combo: */	\n\t"\
		"addq	$0x40,%%rsi 	/* cc,d */\n\t"\
		"movaps	    (%%rax),%%xmm0	\n\t"\
		"movaps	    (%%rcx),%%xmm4	\n\t"\
		"movaps	0x10(%%rax),%%xmm1	\n\t"\
		"movaps	0x10(%%rcx),%%xmm5	\n\t"\
		"movaps	    (%%rsi),%%xmm6	\n\t"\
		"movaps	0x10(%%rsi),%%xmm7	\n\t"\
		"movaps	%%xmm0,%%xmm2		\n\t"\
		"movaps	%%xmm1,%%xmm3		\n\t"\
		"mulpd	%%xmm6,%%xmm0		/* cc */\n\t"\
		"mulpd	%%xmm6,%%xmm1		\n\t"\
		"mulpd	%%xmm7,%%xmm2		\n\t"\
		"mulpd	%%xmm7,%%xmm3		\n\t"\
		"movaps	%%xmm4,%%xmm6		\n\t"\
		"subpd	%%xmm2,%%xmm1		\n\t"\
		"movaps	%%xmm5,%%xmm7		\n\t"\
		"mulpd	0x20(%%rsi),%%xmm4	/* cd */\n\t"\
		"addpd	%%xmm3,%%xmm0		\n\t"\
		"mulpd	0x20(%%rsi),%%xmm5	\n\t"\
		"mulpd	0x30(%%rsi),%%xmm6	\n\t"\
		"movaps	%%xmm0,%%xmm2		\n\t"\
		"mulpd	0x30(%%rsi),%%xmm7	\n\t"\
		"subpd	%%xmm6,%%xmm5		\n\t"\
		"movaps	%%xmm1,%%xmm3		\n\t"\
		"addpd	%%xmm7,%%xmm4		\n\t"\
		"addpd	%%xmm4,%%xmm0		\n\t"\
		"addpd	%%xmm5,%%xmm1		\n\t"\
		"subpd	%%xmm4,%%xmm2		\n\t"\
		"subpd	%%xmm5,%%xmm3		\n\t"\
		"/* Do	the p2,3 combo: */	\n\t"\
		"addq	$0x40,%%rsi 	/* ce,f */\n\t"\
		"movaps	    (%%rdx),%%xmm4	\n\t"\
		"movaps	0x10(%%rdx),%%xmm5	\n\t"\
		"movaps	    (%%rdx),%%xmm6	\n\t"\
		"movaps	0x10(%%rdx),%%xmm7	\n\t"\
		"mulpd	0x20(%%rsi),%%xmm4	/* cf */\n\t"\
		"mulpd	0x20(%%rsi),%%xmm5	\n\t"\
		"mulpd	0x30(%%rsi),%%xmm6	\n\t"\
		"mulpd	0x30(%%rsi),%%xmm7	\n\t"\
		"subpd	%%xmm6,%%xmm5		\n\t"\
		"addpd	%%xmm7,%%xmm4		\n\t"\
		"movaps	%%xmm5,0x10(%%rax)	\n\t"\
		"movaps	%%xmm4,    (%%rax)	\n\t"\
		"movaps	    (%%rbx),%%xmm4	\n\t"\
		"movaps	0x10(%%rbx),%%xmm5	\n\t"\
		"movaps	    (%%rbx),%%xmm6	\n\t"\
		"movaps	0x10(%%rbx),%%xmm7	\n\t"\
		"mulpd	    (%%rsi),%%xmm4	/* ce */\n\t"\
		"mulpd	    (%%rsi),%%xmm5	\n\t"\
		"mulpd	0x10(%%rsi),%%xmm6	\n\t"\
		"mulpd	0x10(%%rsi),%%xmm7	\n\t"\
		"subpd	%%xmm6,%%xmm5		\n\t"\
		"addpd	%%xmm7,%%xmm4		\n\t"\
		"movaps	%%xmm5,%%xmm7		\n\t"\
		"movaps	%%xmm4,%%xmm6		\n\t"\
		"subpd	    (%%rax),%%xmm4	\n\t"\
		"subpd	0x10(%%rax),%%xmm5	\n\t"\
		"addpd	    (%%rax),%%xmm6	\n\t"\
		"addpd	0x10(%%rax),%%xmm7	\n\t"\
		"/* Finish radix-4 butterfly and store results into temporary-array slots: */\n\t"\
		"subpd	%%xmm6,%%xmm0		\n\t"\
		"subpd	%%xmm5,%%xmm2		\n\t"\
		"subpd	%%xmm7,%%xmm1		\n\t"\
		"subpd	%%xmm4,%%xmm3		\n\t"\
		"movaps	%%xmm0,    (%%rbx)	\n\t"\
		"movaps	%%xmm2,    (%%rdx)	\n\t"\
		"movaps	%%xmm1,0x10(%%rbx)	\n\t"\
		"movaps	%%xmm3,0x10(%%rcx)	\n\t"\
		"addpd	%%xmm6,%%xmm6		\n\t"\
		"addpd	%%xmm5,%%xmm5		\n\t"\
		"addpd	%%xmm7,%%xmm7		\n\t"\
		"addpd	%%xmm4,%%xmm4		\n\t"\
		"addpd	%%xmm0,%%xmm6		\n\t"\
		"addpd	%%xmm2,%%xmm5		\n\t"\
		"addpd	%%xmm1,%%xmm7		\n\t"\
		"addpd	%%xmm3,%%xmm4		\n\t"\
		"movaps	%%xmm6,    (%%rax)	\n\t"\
		"movaps	%%xmm5,    (%%rcx)	\n\t"\
		"movaps	%%xmm7,0x10(%%rax)	\n\t"\
		"movaps	%%xmm4,0x10(%%rdx)	\n\t"\
	/*************************************************************************************/\
	/*  And now do four more radix-4 transforms, including the internal twiddle factors: */\
	/*************************************************************************************/\
		"movq	%[__isrt2],%%rsi 	\n\t"\
	/* Block 0: Combine 0-output of each radix-4, i.e. inputs from __in0 + [0,4,8,c]*istride: */\
		"movq	%[__in0],%%rax		\n\t"\
		"leaq	%c[__i4](%%rax),%%rbx	\n\t"/* __in0 +   [4*istride] */\
		"leaq	%c[__i4](%%rbx),%%rcx	\n\t"/* __in0 + 2*[4*istride] */\
		"leaq	%c[__i4](%%rcx),%%rdx	\n\t"/* __in0 + 3*[4*istride] */\
		"movaps	    (%%rax),%%xmm0	\n\t"\
		"movaps	    (%%rcx),%%xmm4	\n\t"\
		"movaps	0x10(%%rax),%%xmm1	\n\t"\
		"movaps	0x10(%%rcx),%%xmm5	\n\t"\
		"movaps	    (%%rbx),%%xmm2	\n\t"\
		"movaps	    (%%rdx),%%xmm6	\n\t"\
		"movaps	0x10(%%rbx),%%xmm3	\n\t"\
		"movaps	0x10(%%rdx),%%xmm7	\n\t"\
		"subpd	%%xmm2,%%xmm0		\n\t"\
		"subpd	%%xmm6,%%xmm4		\n\t"\
		"subpd	%%xmm3,%%xmm1		\n\t"\
		"subpd	%%xmm7,%%xmm5		\n\t"\
		"addpd	%%xmm2,%%xmm2		\n\t"\
		"addpd	%%xmm6,%%xmm6		\n\t"\
		"addpd	%%xmm3,%%xmm3		\n\t"\
		"addpd	%%xmm7,%%xmm7		\n\t"\
		"addpd	%%xmm0,%%xmm2		\n\t"\
		"addpd	%%xmm4,%%xmm6		\n\t"\
		"addpd	%%xmm1,%%xmm3		\n\t"\
		"addpd	%%xmm5,%%xmm7		\n\t"\
		"subpd	%%xmm6,%%xmm2		\n\t"\
		"subpd	%%xmm7,%%xmm3		\n\t"\
		"addpd	%%xmm6,%%xmm6		\n\t"\
		"addpd	%%xmm7,%%xmm7		\n\t"\
		"movq	%[__out0],%%r10		\n\t"\
		"leaq	%c[__o4](%%r10),%%r11	\n\t"/* __out0 + 4*ostride */\
		"leaq	%c[__o4](%%r11),%%r12	\n\t"/* __out0 + 8*ostride */\
		"leaq	%c[__o4](%%r12),%%r13	\n\t"/* __out0 + c*ostride */\
		"movaps	%%xmm2,    (%%r12)	\n\t"\
		"movaps	%%xmm3,0x10(%%r12)	\n\t"\
		"addpd	%%xmm2,	%%xmm6	\n\t"\
		"addpd	%%xmm3,	%%xmm7	\n\t"\
		"movaps	%%xmm6,    (%%r10)	\n\t"\
		"movaps	%%xmm7,0x10(%%r10)	\n\t"\
		"subpd	%%xmm5,	%%xmm0	\n\t"\
		"subpd	%%xmm4,	%%xmm1	\n\t"\
		"addpd	%%xmm5,	%%xmm5	\n\t"\
		"addpd	%%xmm4,	%%xmm4	\n\t"\
		"movaps	%%xmm0,    (%%r13)	\n\t"/* These 2 outputs [4/c] swapped w.r.to dif [2/3] due to +-I sign diff */\
		"movaps	%%xmm1,0x10(%%r11)	\n\t"\
		"addpd	%%xmm0,	%%xmm5	\n\t"\
		"addpd	%%xmm1,	%%xmm4	\n\t"\
		"movaps	%%xmm5,    (%%r11)	\n\t"\
		"movaps	%%xmm4,0x10(%%r13)	\n\t"\
	/* Block 1: Combine 1-output of each radix-4, i.e. inputs from __in0 + [1,5,9,d]*istride: */\
		"addq	$%c[__i1],%%rax	\n\t"/* __in0 + 1*istride */\
		"addq	$%c[__i1],%%rcx	\n\t"/* __in0 + 5*istride */\
		"addq	$%c[__i1],%%rbx	\n\t"/* __in0 + 9*istride */\
		"addq	$%c[__i1],%%rdx	\n\t"/* __in0 + d*istride */\
		"movaps	    (%%rdx),%%xmm0	\n\t"\
		"movaps	0x10(%%rdx),%%xmm1	\n\t"\
		"movaps	    (%%rdx),%%xmm2	\n\t"\
		"movaps	0x10(%%rdx),%%xmm3	\n\t"\
		"mulpd	0x20(%%rsi),%%xmm0	\n\t"/* ss0 */\
		"mulpd	0x20(%%rsi),%%xmm1	\n\t"\
		"mulpd	0x10(%%rsi),%%xmm2	\n\t"/* cc0 */\
		"mulpd	0x10(%%rsi),%%xmm3	\n\t"\
		"subpd	%%xmm2,%%xmm1		\n\t"\
		"addpd	%%xmm3,%%xmm0		\n\t"\
		"movaps	    (%%rcx),%%xmm4	\n\t"\
		"movaps	0x10(%%rcx),%%xmm5	\n\t"\
		"movaps	    (%%rcx),%%xmm6	\n\t"\
		"movaps	0x10(%%rcx),%%xmm7	\n\t"\
		"mulpd	0x10(%%rsi),%%xmm4	\n\t"/* cc0 */\
		"mulpd	0x10(%%rsi),%%xmm5	\n\t"\
		"mulpd	0x20(%%rsi),%%xmm6	\n\t"/* ss0 */\
		"mulpd	0x20(%%rsi),%%xmm7	\n\t"\
		"subpd	%%xmm6,%%xmm5		\n\t"\
		"addpd	%%xmm7,%%xmm4		\n\t"\
		"movaps	%%xmm5,%%xmm7		\n\t"\
		"movaps	%%xmm4,%%xmm6		\n\t"\
		"addpd	%%xmm0,%%xmm4		\n\t"\
		"addpd	%%xmm1,%%xmm5		\n\t"\
		"subpd	%%xmm0,%%xmm6		\n\t"\
		"subpd	%%xmm1,%%xmm7		\n\t"\
		"movaps	    (%%rbx),%%xmm2	\n\t"\
		"movaps	0x10(%%rbx),%%xmm3	\n\t"\
		"movaps	    (%%rax),%%xmm0	\n\t"\
		"movaps	0x10(%%rax),%%xmm1	\n\t"\
		"addpd	0x10(%%rbx),%%xmm2	\n\t"\
		"subpd	    (%%rbx),%%xmm3	\n\t"\
		"mulpd	    (%%rsi),%%xmm2	\n\t"/* isrt2 */\
		"mulpd	    (%%rsi),%%xmm3	\n\t"\
		"subpd	%%xmm2,%%xmm0		\n\t"\
		"subpd	%%xmm3,%%xmm1		\n\t"\
		"addpd	%%xmm2,%%xmm2		\n\t"\
		"addpd	%%xmm3,%%xmm3		\n\t"\
		"addpd	%%xmm0,%%xmm2		\n\t"\
		"addpd	%%xmm1,%%xmm3		\n\t"\
		"addq	$%c[__o1],%%r10	\n\t"/* __out0 + 1*ostride */\
		"addq	$%c[__o1],%%r11	\n\t"/* __out0 + 5*ostride */\
		"addq	$%c[__o1],%%r12	\n\t"/* __out0 + 9*ostride */\
		"addq	$%c[__o1],%%r13	\n\t"/* __out0 + d*ostride */\
		"subpd	%%xmm4,%%xmm2		\n\t"\
		"subpd	%%xmm5,%%xmm3		\n\t"\
		"movaps	%%xmm2,    (%%r12)	\n\t"\
		"movaps	%%xmm3,0x10(%%r12)	\n\t"\
		"addpd	%%xmm4,%%xmm4		\n\t"\
		"addpd	%%xmm5,%%xmm5		\n\t"\
		"addpd	%%xmm2,%%xmm4		\n\t"\
		"addpd	%%xmm3,%%xmm5		\n\t"\
		"movaps	%%xmm4,    (%%r10)	\n\t"\
		"movaps	%%xmm5,0x10(%%r10)	\n\t"\
		"subpd	%%xmm7,%%xmm0		\n\t"\
		"subpd	%%xmm6,%%xmm1		\n\t"\
		"movaps	%%xmm0,    (%%r13)	\n\t"\
		"movaps	%%xmm1,0x10(%%r11)	\n\t"\
		"addpd	%%xmm7,%%xmm7		\n\t"\
		"addpd	%%xmm6,%%xmm6		\n\t"\
		"addpd	%%xmm0,%%xmm7		\n\t"\
		"addpd	%%xmm1,%%xmm6		\n\t"\
		"movaps	%%xmm7,    (%%r11)	\n\t"\
		"movaps	%%xmm6,0x10(%%r13)	\n\t"\
	/* Block 2: Combine 2-output of each radix-4, i.e. inputs from __in0 + [2,6,a,e]*istride: */\
		"movaps	(%%rsi),%%xmm2	/* isrt2 */\n\t"\
		"addq	$%c[__i1],%%rax	\n\t"/* __in0 + 2*istride */\
		"addq	$%c[__i1],%%rbx	\n\t"/* __in0 + 6*istride */\
		"addq	$%c[__i1],%%rcx	\n\t"/* __in0 + a*istride */\
		"addq	$%c[__i1],%%rdx	\n\t"/* __in0 + e*istride */\
		"movaps	    (%%rcx),%%xmm4	\n\t"\
		"movaps	0x10(%%rcx),%%xmm5	\n\t"\
		"movaps	    (%%rdx),%%xmm0	\n\t"\
		"movaps	0x10(%%rdx),%%xmm1	\n\t"\
		"addpd	0x10(%%rcx),%%xmm4	\n\t"\
		"subpd	    (%%rcx),%%xmm5	\n\t"\
		"subpd	0x10(%%rdx),%%xmm0	\n\t"\
		"addpd	    (%%rdx),%%xmm1	\n\t"\
		"mulpd	%%xmm2,%%xmm4		\n\t"\
		"mulpd	%%xmm2,%%xmm5		\n\t"\
		"mulpd	%%xmm2,%%xmm0		\n\t"\
		"mulpd	%%xmm2,%%xmm1		\n\t"\
		"movaps	%%xmm4,%%xmm6		\n\t"\
		"movaps	%%xmm5,%%xmm7		\n\t"\
		"subpd	%%xmm0,%%xmm4		\n\t"\
		"subpd	%%xmm1,%%xmm5		\n\t"\
		"addpd	%%xmm0,%%xmm6		\n\t"\
		"addpd	%%xmm1,%%xmm7		\n\t"\
		"movaps	    (%%rax),%%xmm0	\n\t"\
		"movaps	0x10(%%rax),%%xmm1	\n\t"\
		"movaps	    (%%rbx),%%xmm2	\n\t"\
		"movaps	0x10(%%rbx),%%xmm3	\n\t"\
		"subpd	0x10(%%rbx),%%xmm0	\n\t"\
		"subpd	    (%%rbx),%%xmm1	\n\t"\
		"addpd	    (%%rax),%%xmm3	\n\t"\
		"addpd	0x10(%%rax),%%xmm2	\n\t"\
		"addq	$%c[__o1],%%r10	\n\t"/* __out0 + 2*ostride */\
		"addq	$%c[__o1],%%r11	\n\t"/* __out0 + 6*ostride */\
		"addq	$%c[__o1],%%r12	\n\t"/* __out0 + a*ostride */\
		"addq	$%c[__o1],%%r13	\n\t"/* __out0 + e*ostride */\
		"subpd	%%xmm4,%%xmm3		\n\t"\
		"subpd	%%xmm5,%%xmm1		\n\t"\
		"movaps	%%xmm3,    (%%r12)	\n\t"\
		"movaps	%%xmm1,0x10(%%r12)	\n\t"\
		"addpd	%%xmm4,%%xmm4		\n\t"\
		"addpd	%%xmm5,%%xmm5		\n\t"\
		"addpd	%%xmm3,%%xmm4		\n\t"\
		"addpd	%%xmm1,%%xmm5		\n\t"\
		"movaps	%%xmm4,    (%%r10)	\n\t"\
		"movaps	%%xmm5,0x10(%%r10)	\n\t"\
		"subpd	%%xmm7,%%xmm0		\n\t"\
		"subpd	%%xmm6,%%xmm2		\n\t"\
		"movaps	%%xmm0,    (%%r13)	\n\t"\
		"movaps	%%xmm2,0x10(%%r11)	\n\t"\
		"addpd	%%xmm7,%%xmm7		\n\t"\
		"addpd	%%xmm6,%%xmm6		\n\t"\
		"addpd	%%xmm0,%%xmm7		\n\t"\
		"addpd	%%xmm2,%%xmm6		\n\t"\
		"movaps	%%xmm7,    (%%r11)	\n\t"\
		"movaps	%%xmm6,0x10(%%r13)	\n\t"\
	/* Block 3: Combine 3-output of each radix-4, i.e. inputs from __in0 + [3,7,b,f]*istride: */\
		"addq	$%c[__i1],%%rax	\n\t"/* __in0 + 3*istride */\
		"addq	$%c[__i1],%%rcx	\n\t"/* __in0 + 7*istride */\
		"addq	$%c[__i1],%%rbx	\n\t"/* __in0 + b*istride */\
		"addq	$%c[__i1],%%rdx	\n\t"/* __in0 + f*istride */\
		"movaps	    (%%rdx),%%xmm0	\n\t"\
		"movaps	0x10(%%rdx),%%xmm1	\n\t"\
		"movaps	    (%%rdx),%%xmm2	\n\t"\
		"movaps	0x10(%%rdx),%%xmm3	\n\t"\
		"mulpd	0x10(%%rsi),%%xmm0	\n\t"/* cc0 */\
		"mulpd	0x10(%%rsi),%%xmm1	\n\t"\
		"mulpd	0x20(%%rsi),%%xmm2	\n\t"/* ss0 */\
		"mulpd	0x20(%%rsi),%%xmm3	\n\t"\
		"subpd	%%xmm2,%%xmm1		\n\t"\
		"addpd	%%xmm3,%%xmm0		\n\t"\
		"movaps	    (%%rcx),%%xmm4	\n\t"\
		"movaps	0x10(%%rcx),%%xmm5	\n\t"\
		"movaps	    (%%rcx),%%xmm6	\n\t"\
		"movaps	0x10(%%rcx),%%xmm7	\n\t"\
		"mulpd	0x20(%%rsi),%%xmm4	\n\t"/* ss0 */\
		"mulpd	0x20(%%rsi),%%xmm5	\n\t"\
		"mulpd	0x10(%%rsi),%%xmm6	\n\t"/* cc0 */\
		"mulpd	0x10(%%rsi),%%xmm7	\n\t"\
		"subpd	%%xmm6,%%xmm5		\n\t"\
		"addpd	%%xmm7,%%xmm4		\n\t"\
		"movaps	%%xmm5,%%xmm7		\n\t"\
		"movaps	%%xmm4,%%xmm6		\n\t"\
		"addpd	%%xmm0,%%xmm4		\n\t"\
		"addpd	%%xmm1,%%xmm5		\n\t"\
		"subpd	%%xmm0,%%xmm6		\n\t"\
		"subpd	%%xmm1,%%xmm7		\n\t"\
		"movaps	    (%%rbx),%%xmm2	\n\t"\
		"movaps	0x10(%%rbx),%%xmm3	\n\t"\
		"movaps	    (%%rax),%%xmm0	\n\t"\
		"movaps	0x10(%%rax),%%xmm1	\n\t"\
		"subpd	0x10(%%rbx),%%xmm2	\n\t"\
		"addpd	    (%%rbx),%%xmm3	\n\t"\
		"mulpd	    (%%rsi),%%xmm2	\n\t"/* isrt2 */\
		"mulpd	    (%%rsi),%%xmm3	\n\t"\
		"subpd	%%xmm2,%%xmm0		\n\t"\
		"subpd	%%xmm3,%%xmm1		\n\t"\
		"addpd	%%xmm2,%%xmm2		\n\t"\
		"addpd	%%xmm3,%%xmm3		\n\t"\
		"addpd	%%xmm0,%%xmm2		\n\t"\
		"addpd	%%xmm1,%%xmm3		\n\t"\
		"addq	$%c[__o1],%%r10	\n\t"/* __out0 + 3*ostride */\
		"addq	$%c[__o1],%%r12	\n\t"/* __out0 + 7*ostride */\
		"addq	$%c[__o1],%%r11	\n\t"/* __out0 + b*ostride */\
		"addq	$%c[__o1],%%r13	\n\t"/* __out0 + f*ostride */\
		"subpd	%%xmm6,%%xmm0		\n\t"\
		"subpd	%%xmm7,%%xmm1		\n\t"\
		"movaps	%%xmm0,    (%%r12)	\n\t"\
		"movaps	%%xmm1,0x10(%%r12)	\n\t"\
		"addpd	%%xmm6,%%xmm6		\n\t"\
		"addpd	%%xmm7,%%xmm7		\n\t"\
		"addpd	%%xmm0,%%xmm6		\n\t"\
		"addpd	%%xmm1,%%xmm7		\n\t"\
		"movaps	%%xmm6,    (%%r10)	\n\t"\
		"movaps	%%xmm7,0x10(%%r10)	\n\t"\
		"subpd	%%xmm5,%%xmm2		\n\t"\
		"subpd	%%xmm4,%%xmm3		\n\t"\
		"movaps	%%xmm2,    (%%r13)	\n\t"\
		"movaps	%%xmm3,0x10(%%r11)	\n\t"\
		"addpd	%%xmm5,%%xmm5		\n\t"\
		"addpd	%%xmm4,%%xmm4		\n\t"\
		"addpd	%%xmm2,%%xmm5		\n\t"\
		"addpd	%%xmm3,%%xmm4		\n\t"\
		"movaps	%%xmm5,    (%%r11)	\n\t"\
		"movaps	%%xmm4,0x10(%%r13)	\n\t"\
		:					/* outputs: none */\
		: [__in0] "m" (Xin0)	/* All inputs from memory addresses here */\
		 ,[__i1] "e" (Xi1)\
		 ,[__i2] "e" (Xi2)\
		 ,[__i3] "e" (Xi3)\
		 ,[__i4] "e" (Xi4)\
		 ,[__out0] "m" (Xout0)\
		 ,[__o1] "e" (Xo1)\
		 ,[__o2] "e" (Xo2)\
		 ,[__o3] "e" (Xo3)\
		 ,[__o4] "e" (Xo4)\
		 ,[__isrt2] "m" (Xisrt2)\
		 ,[__c1] "m" (Xc1)\
		: "cc","memory","rax","rbx","rcx","rdx","rsi","r10","r11","r12","r13","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7"		/* Clobbered registers */\
	);\
	}

	// DIF version of above shares same sincos layout & data:
	/* Jan 2021: To restore Clang-on-Armv8 buildability, we needed to cut the total [#args + #GPRs] substantially;
	this is not necessary for Clang/x86_64 builds, but we want to preserve the same macro interface across builds.
	Note that for this particular macro, the original 4 input offsets are the const-multiples stride*[1,2,3,4] - we
	keep just stride*[1,4]. What were formerly 16 separate output-offset args are padded-index, and possibly
	permuted, i.e. base_ptr + perm(p[0-f]); these we now input via a base pointer and an (int *)offset-array
	pointer. Thus we save a total of 2+14 = 16 args.
	*/
	#define SSE2_RADIX16_DIF_TWIDDLE_OOP(Xin0,Xi1,Xi4, Xout0,Xoff, Xisrt2,Xc1)\
	{\
	__asm__ volatile (\
	/*...Block 0: Do in-place, i.e. outputs into __in0 + [0,1,2,3]*istride: */\
		"movq	%[__in0],%%rax		\n\t"\
		"leaq	%c[__i1](%%rax),%%rcx	\n\t"/* __in0 +   istride */\
		"leaq	%c[__i1](%%rcx),%%rbx	\n\t"/* __in0 + 2*istride */\
		"leaq	%c[__i1](%%rbx),%%rdx	\n\t"/* __in0 + 3*istride */\
		"/* Do	the p0,1 combo: */	\n\t"\
		"movq	%[__c1],%%rsi 	/* Roots sets c1-15 same as for DIT, w/c1 as base-ptr */\n\t"\
		"movaps	    (%%rax),%%xmm0	\n\t"\
		"movaps	    (%%rcx),%%xmm4	\n\t"\
		"movaps	0x10(%%rax),%%xmm1	\n\t"\
		"movaps	0x10(%%rcx),%%xmm5	\n\t"\
		"movaps	%%xmm4,%%xmm6		\n\t"\
		"movaps	%%xmm5,%%xmm7		\n\t"\
		"mulpd	    (%%rsi),%%xmm4	\n\t"\
		"mulpd	    (%%rsi),%%xmm5	\n\t"\
		"mulpd	0x10(%%rsi),%%xmm6	\n\t"\
		"movaps	%%xmm0,%%xmm2		\n\t"\
		"mulpd	0x10(%%rsi),%%xmm7	\n\t"\
		"addpd	%%xmm6,%%xmm5		\n\t"\
		"movaps	%%xmm1,%%xmm3		\n\t"\
		"subpd	%%xmm7,%%xmm4		\n\t"\
		"addpd	%%xmm4,%%xmm0		\n\t"\
		"addpd	%%xmm5,%%xmm1		\n\t"\
		"subpd	%%xmm4,%%xmm2		\n\t"\
		"subpd	%%xmm5,%%xmm3		\n\t"\
		"/* Do	the p2,3 combo: */	\n\t"\
		"addq	$0x20,%%rsi 	/* c2,3 */\n\t"\
		"movaps	    (%%rdx),%%xmm4	\n\t"\
		"movaps	0x10(%%rdx),%%xmm5	\n\t"\
		"movaps	    (%%rdx),%%xmm6	\n\t"\
		"movaps	0x10(%%rdx),%%xmm7	\n\t"\
		"mulpd	0x20(%%rsi),%%xmm4	/* c3 */\n\t"\
		"mulpd	0x20(%%rsi),%%xmm5	\n\t"\
		"mulpd	0x30(%%rsi),%%xmm6	\n\t"\
		"mulpd	0x30(%%rsi),%%xmm7	\n\t"\
		"addpd	%%xmm6,%%xmm5		\n\t"\
		"subpd	%%xmm7,%%xmm4		\n\t"\
		"movaps	%%xmm5,0x10(%%rax)	\n\t"\
		"movaps	%%xmm4,    (%%rax)	\n\t"\
		"movaps	    (%%rbx),%%xmm4	\n\t"\
		"movaps	0x10(%%rbx),%%xmm5	\n\t"\
		"movaps	    (%%rbx),%%xmm6	\n\t"\
		"movaps	0x10(%%rbx),%%xmm7	\n\t"\
		"mulpd	    (%%rsi),%%xmm4	/* c2 */\n\t"\
		"mulpd	    (%%rsi),%%xmm5	\n\t"\
		"mulpd	0x10(%%rsi),%%xmm6	\n\t"\
		"mulpd	0x10(%%rsi),%%xmm7	\n\t"\
		"addpd	%%xmm6,%%xmm5		\n\t"\
		"subpd	%%xmm7,%%xmm4		\n\t"\
		"movaps	%%xmm5,%%xmm7		\n\t"\
		"movaps	%%xmm4,%%xmm6		\n\t"\
		"subpd	    (%%rax),%%xmm4	\n\t"\
		"subpd	0x10(%%rax),%%xmm5	\n\t"\
		"addpd	    (%%rax),%%xmm6	\n\t"\
		"addpd	0x10(%%rax),%%xmm7	\n\t"\
		"/* Finish radix-4 butterfly and store results into temporary-array slots: */\n\t"\
		"subpd	%%xmm6,%%xmm0		\n\t"\
		"subpd	%%xmm5,%%xmm2		\n\t"\
		"subpd	%%xmm7,%%xmm1		\n\t"\
		"subpd	%%xmm4,%%xmm3		\n\t"\
		"movaps	%%xmm0,    (%%rcx)	\n\t"\
		"movaps	%%xmm2,    (%%rbx)	\n\t"\
		"movaps	%%xmm1,0x10(%%rcx)	\n\t"\
		"movaps	%%xmm3,0x10(%%rdx)	\n\t"\
		"addpd	%%xmm6,%%xmm6		\n\t"\
		"addpd	%%xmm5,%%xmm5		\n\t"\
		"addpd	%%xmm7,%%xmm7		\n\t"\
		"addpd	%%xmm4,%%xmm4		\n\t"\
		"addpd	%%xmm0,%%xmm6		\n\t"\
		"addpd	%%xmm2,%%xmm5		\n\t"\
		"addpd	%%xmm1,%%xmm7		\n\t"\
		"addpd	%%xmm3,%%xmm4		\n\t"\
		"movaps	%%xmm6,    (%%rax)	\n\t"\
		"movaps	%%xmm5,    (%%rdx)	\n\t"\
		"movaps	%%xmm7,0x10(%%rax)	\n\t"\
		"movaps	%%xmm4,0x10(%%rbx)	\n\t"\
		"\n\t"\
	/*...Block 1: outputs into __in0 + [4,5,6,7]*istride: */\
		"addq	$%c[__i4],%%rax	\n\t"/* __in0 + 4*istride */\
		"addq	$%c[__i4],%%rcx	\n\t"/* __in0 + 5*istride */\
		"addq	$%c[__i4],%%rbx	\n\t"/* __in0 + 6*istride */\
		"addq	$%c[__i4],%%rdx	\n\t"/* __in0 + 7*istride */\
		"/* Do	the p0,1 combo: */	\n\t"\
		"addq	$0x40,%%rsi 	/* c4,5 */\n\t"\
		"movaps	    (%%rax),%%xmm0	\n\t"\
		"movaps	    (%%rcx),%%xmm4	\n\t"\
		"movaps	0x10(%%rax),%%xmm1	\n\t"\
		"movaps	0x10(%%rcx),%%xmm5	\n\t"\
		"movaps	    (%%rsi),%%xmm6	\n\t"\
		"movaps	0x10(%%rsi),%%xmm7	\n\t"\
		"movaps	%%xmm0,%%xmm2		\n\t"\
		"movaps	%%xmm1,%%xmm3		\n\t"\
		"mulpd	%%xmm6,%%xmm0		/* c4 */\n\t"\
		"mulpd	%%xmm6,%%xmm1		\n\t"\
		"mulpd	%%xmm7,%%xmm2		\n\t"\
		"mulpd	%%xmm7,%%xmm3		\n\t"\
		"movaps	%%xmm4,%%xmm6		\n\t"\
		"addpd	%%xmm2,%%xmm1		\n\t"\
		"movaps	%%xmm5,%%xmm7		\n\t"\
		"mulpd	0x20(%%rsi),%%xmm4	/* c5 */\n\t"\
		"subpd	%%xmm3,%%xmm0		\n\t"\
		"mulpd	0x20(%%rsi),%%xmm5	\n\t"\
		"mulpd	0x30(%%rsi),%%xmm6	\n\t"\
		"movaps	%%xmm0,%%xmm2		\n\t"\
		"mulpd	0x30(%%rsi),%%xmm7	\n\t"\
		"addpd	%%xmm6,%%xmm5		\n\t"\
		"movaps	%%xmm1,%%xmm3		\n\t"\
		"subpd	%%xmm7,%%xmm4		\n\t"\
		"addpd	%%xmm4,%%xmm0		\n\t"\
		"addpd	%%xmm5,%%xmm1		\n\t"\
		"subpd	%%xmm4,%%xmm2		\n\t"\
		"subpd	%%xmm5,%%xmm3		\n\t"\
		"/* Do	the p2,3 combo: */	\n\t"\
		"addq	$0x40,%%rsi 	/* c6,7 */\n\t"\
		"movaps	    (%%rdx),%%xmm4	\n\t"\
		"movaps	0x10(%%rdx),%%xmm5	\n\t"\
		"movaps	    (%%rdx),%%xmm6	\n\t"\
		"movaps	0x10(%%rdx),%%xmm7	\n\t"\
		"mulpd	0x20(%%rsi),%%xmm4	/* c7 */\n\t"\
		"mulpd	0x20(%%rsi),%%xmm5	\n\t"\
		"mulpd	0x30(%%rsi),%%xmm6	\n\t"\
		"mulpd	0x30(%%rsi),%%xmm7	\n\t"\
		"addpd	%%xmm6,%%xmm5		\n\t"\
		"subpd	%%xmm7,%%xmm4		\n\t"\
		"movaps	%%xmm5,0x10(%%rax)	\n\t"\
		"movaps	%%xmm4,    (%%rax)	\n\t"\
		"movaps	    (%%rbx),%%xmm4	\n\t"\
		"movaps	0x10(%%rbx),%%xmm5	\n\t"\
		"movaps	    (%%rbx),%%xmm6	\n\t"\
		"movaps	0x10(%%rbx),%%xmm7	\n\t"\
		"mulpd	    (%%rsi),%%xmm4	/* c6 */\n\t"\
		"mulpd	    (%%rsi),%%xmm5	\n\t"\
		"mulpd	0x10(%%rsi),%%xmm6	\n\t"\
		"mulpd	0x10(%%rsi),%%xmm7	\n\t"\
		"addpd	%%xmm6,%%xmm5		\n\t"\
		"subpd	%%xmm7,%%xmm4		\n\t"\
		"movaps	%%xmm5,%%xmm7		\n\t"\
		"movaps	%%xmm4,%%xmm6		\n\t"\
		"subpd	    (%%rax),%%xmm4	\n\t"\
		"subpd	0x10(%%rax),%%xmm5	\n\t"\
		"addpd	    (%%rax),%%xmm6	\n\t"\
		"addpd	0x10(%%rax),%%xmm7	\n\t"\
		"/* Finish radix-4 butterfly and store results into temporary-array slots: */\n\t"\
		"subpd	%%xmm6,%%xmm0		\n\t"\
		"subpd	%%xmm5,%%xmm2		\n\t"\
		"subpd	%%xmm7,%%xmm1		\n\t"\
		"subpd	%%xmm4,%%xmm3		\n\t"\
		"movaps	%%xmm0,    (%%rcx)	\n\t"\
		"movaps	%%xmm2,    (%%rbx)	\n\t"\
		"movaps	%%xmm1,0x10(%%rcx)	\n\t"\
		"movaps	%%xmm3,0x10(%%rdx)	\n\t"\
		"addpd	%%xmm6,%%xmm6		\n\t"\
		"addpd	%%xmm5,%%xmm5		\n\t"\
		"addpd	%%xmm7,%%xmm7		\n\t"\
		"addpd	%%xmm4,%%xmm4		\n\t"\
		"addpd	%%xmm0,%%xmm6		\n\t"\
		"addpd	%%xmm2,%%xmm5		\n\t"\
		"addpd	%%xmm1,%%xmm7		\n\t"\
		"addpd	%%xmm3,%%xmm4		\n\t"\
		"movaps	%%xmm6,    (%%rax)	\n\t"\
		"movaps	%%xmm5,    (%%rdx)	\n\t"\
		"movaps	%%xmm7,0x10(%%rax)	\n\t"\
		"movaps	%%xmm4,0x10(%%rbx)	\n\t"\
		"\n\t"\
	/*...Block 2: outputs into __in0 + [8,9,a,b]*istride: */\
		"addq	$%c[__i4],%%rax	\n\t"/* __in0 + 8*istride */\
		"addq	$%c[__i4],%%rcx	\n\t"/* __in0 + 9*istride */\
		"addq	$%c[__i4],%%rbx	\n\t"/* __in0 + a*istride */\
		"addq	$%c[__i4],%%rdx	\n\t"/* __in0 + b*istride */\
		"/* Do	the p0,1 combo: */	\n\t"\
		"addq	$0x40,%%rsi 	/* c8,9 */\n\t"\
		"movaps	    (%%rax),%%xmm0	\n\t"\
		"movaps	    (%%rcx),%%xmm4	\n\t"\
		"movaps	0x10(%%rax),%%xmm1	\n\t"\
		"movaps	0x10(%%rcx),%%xmm5	\n\t"\
		"movaps	    (%%rsi),%%xmm6	\n\t"\
		"movaps	0x10(%%rsi),%%xmm7	\n\t"\
		"movaps	%%xmm0,%%xmm2		\n\t"\
		"movaps	%%xmm1,%%xmm3		\n\t"\
		"mulpd	%%xmm6,%%xmm0		/* c8 */\n\t"\
		"mulpd	%%xmm6,%%xmm1		\n\t"\
		"mulpd	%%xmm7,%%xmm2		\n\t"\
		"mulpd	%%xmm7,%%xmm3		\n\t"\
		"movaps	%%xmm4,%%xmm6		\n\t"\
		"addpd	%%xmm2,%%xmm1		\n\t"\
		"movaps	%%xmm5,%%xmm7		\n\t"\
		"mulpd	0x20(%%rsi),%%xmm4	/* c9 */\n\t"\
		"subpd	%%xmm3,%%xmm0		\n\t"\
		"mulpd	0x20(%%rsi),%%xmm5	\n\t"\
		"mulpd	0x30(%%rsi),%%xmm6	\n\t"\
		"movaps	%%xmm0,%%xmm2		\n\t"\
		"mulpd	0x30(%%rsi),%%xmm7	\n\t"\
		"addpd	%%xmm6,%%xmm5		\n\t"\
		"movaps	%%xmm1,%%xmm3		\n\t"\
		"subpd	%%xmm7,%%xmm4		\n\t"\
		"addpd	%%xmm4,%%xmm0		\n\t"\
		"addpd	%%xmm5,%%xmm1		\n\t"\
		"subpd	%%xmm4,%%xmm2		\n\t"\
		"subpd	%%xmm5,%%xmm3		\n\t"\
		"/* Do	the p2,3 combo: */	\n\t"\
		"addq	$0x40,%%rsi 	/* ca,b */\n\t"\
		"movaps	    (%%rdx),%%xmm4	\n\t"\
		"movaps	0x10(%%rdx),%%xmm5	\n\t"\
		"movaps	    (%%rdx),%%xmm6	\n\t"\
		"movaps	0x10(%%rdx),%%xmm7	\n\t"\
		"mulpd	0x20(%%rsi),%%xmm4	/* cb */\n\t"\
		"mulpd	0x20(%%rsi),%%xmm5	\n\t"\
		"mulpd	0x30(%%rsi),%%xmm6	\n\t"\
		"mulpd	0x30(%%rsi),%%xmm7	\n\t"\
		"addpd	%%xmm6,%%xmm5		\n\t"\
		"subpd	%%xmm7,%%xmm4		\n\t"\
		"movaps	%%xmm5,0x10(%%rax)	\n\t"\
		"movaps	%%xmm4,    (%%rax)	\n\t"\
		"movaps	    (%%rbx),%%xmm4	\n\t"\
		"movaps	0x10(%%rbx),%%xmm5	\n\t"\
		"movaps	    (%%rbx),%%xmm6	\n\t"\
		"movaps	0x10(%%rbx),%%xmm7	\n\t"\
		"mulpd	    (%%rsi),%%xmm4	/* ca */\n\t"\
		"mulpd	    (%%rsi),%%xmm5	\n\t"\
		"mulpd	0x10(%%rsi),%%xmm6	\n\t"\
		"mulpd	0x10(%%rsi),%%xmm7	\n\t"\
		"addpd	%%xmm6,%%xmm5		\n\t"\
		"subpd	%%xmm7,%%xmm4		\n\t"\
		"movaps	%%xmm5,%%xmm7		\n\t"\
		"movaps	%%xmm4,%%xmm6		\n\t"\
		"subpd	    (%%rax),%%xmm4	\n\t"\
		"subpd	0x10(%%rax),%%xmm5	\n\t"\
		"addpd	    (%%rax),%%xmm6	\n\t"\
		"addpd	0x10(%%rax),%%xmm7	\n\t"\
		"/* Finish radix-4 butterfly and store results into temporary-array slots: */\n\t"\
		"subpd	%%xmm6,%%xmm0		\n\t"\
		"subpd	%%xmm5,%%xmm2		\n\t"\
		"subpd	%%xmm7,%%xmm1		\n\t"\
		"subpd	%%xmm4,%%xmm3		\n\t"\
		"movaps	%%xmm0,    (%%rcx)	\n\t"\
		"movaps	%%xmm2,    (%%rbx)	\n\t"\
		"movaps	%%xmm1,0x10(%%rcx)	\n\t"\
		"movaps	%%xmm3,0x10(%%rdx)	\n\t"\
		"addpd	%%xmm6,%%xmm6		\n\t"\
		"addpd	%%xmm5,%%xmm5		\n\t"\
		"addpd	%%xmm7,%%xmm7		\n\t"\
		"addpd	%%xmm4,%%xmm4		\n\t"\
		"addpd	%%xmm0,%%xmm6		\n\t"\
		"addpd	%%xmm2,%%xmm5		\n\t"\
		"addpd	%%xmm1,%%xmm7		\n\t"\
		"addpd	%%xmm3,%%xmm4		\n\t"\
		"movaps	%%xmm6,    (%%rax)	\n\t"\
		"movaps	%%xmm5,    (%%rdx)	\n\t"\
		"movaps	%%xmm7,0x10(%%rax)	\n\t"\
		"movaps	%%xmm4,0x10(%%rbx)	\n\t"\
		"\n\t"\
	/*...Block 3: outputs into __in0 + [c,d,e,f]*istride: */\
		"addq	$%c[__i4],%%rax	\n\t"/* __in0 + c*istride */\
		"addq	$%c[__i4],%%rcx	\n\t"/* __in0 + d*istride */\
		"addq	$%c[__i4],%%rbx	\n\t"/* __in0 + e*istride */\
		"addq	$%c[__i4],%%rdx	\n\t"/* __in0 + f*istride */\
		"/* Do	the p0,1 combo: */	\n\t"\
		"addq	$0x40,%%rsi 	/* cc,d */\n\t"\
		"movaps	    (%%rax),%%xmm0	\n\t"\
		"movaps	    (%%rcx),%%xmm4	\n\t"\
		"movaps	0x10(%%rax),%%xmm1	\n\t"\
		"movaps	0x10(%%rcx),%%xmm5	\n\t"\
		"movaps	    (%%rsi),%%xmm6	\n\t"\
		"movaps	0x10(%%rsi),%%xmm7	\n\t"\
		"movaps	%%xmm0,%%xmm2		\n\t"\
		"movaps	%%xmm1,%%xmm3		\n\t"\
		"mulpd	%%xmm6,%%xmm0		/* cc */\n\t"\
		"mulpd	%%xmm6,%%xmm1		\n\t"\
		"mulpd	%%xmm7,%%xmm2		\n\t"\
		"mulpd	%%xmm7,%%xmm3		\n\t"\
		"movaps	%%xmm4,%%xmm6		\n\t"\
		"addpd	%%xmm2,%%xmm1		\n\t"\
		"movaps	%%xmm5,%%xmm7		\n\t"\
		"mulpd	0x20(%%rsi),%%xmm4	/* cd */\n\t"\
		"subpd	%%xmm3,%%xmm0		\n\t"\
		"mulpd	0x20(%%rsi),%%xmm5	\n\t"\
		"mulpd	0x30(%%rsi),%%xmm6	\n\t"\
		"movaps	%%xmm0,%%xmm2		\n\t"\
		"mulpd	0x30(%%rsi),%%xmm7	\n\t"\
		"addpd	%%xmm6,%%xmm5		\n\t"\
		"movaps	%%xmm1,%%xmm3		\n\t"\
		"subpd	%%xmm7,%%xmm4		\n\t"\
		"addpd	%%xmm4,%%xmm0		\n\t"\
		"addpd	%%xmm5,%%xmm1		\n\t"\
		"subpd	%%xmm4,%%xmm2		\n\t"\
		"subpd	%%xmm5,%%xmm3		\n\t"\
		"/* Do	the p2,3 combo: */	\n\t"\
		"addq	$0x40,%%rsi 	/* ce,f */\n\t"\
		"movaps	    (%%rdx),%%xmm4	\n\t"\
		"movaps	0x10(%%rdx),%%xmm5	\n\t"\
		"movaps	    (%%rdx),%%xmm6	\n\t"\
		"movaps	0x10(%%rdx),%%xmm7	\n\t"\
		"mulpd	0x20(%%rsi),%%xmm4	/* cf */\n\t"\
		"mulpd	0x20(%%rsi),%%xmm5	\n\t"\
		"mulpd	0x30(%%rsi),%%xmm6	\n\t"\
		"mulpd	0x30(%%rsi),%%xmm7	\n\t"\
		"addpd	%%xmm6,%%xmm5		\n\t"\
		"subpd	%%xmm7,%%xmm4		\n\t"\
		"movaps	%%xmm5,0x10(%%rax)	\n\t"\
		"movaps	%%xmm4,    (%%rax)	\n\t"\
		"movaps	    (%%rbx),%%xmm4	\n\t"\
		"movaps	0x10(%%rbx),%%xmm5	\n\t"\
		"movaps	    (%%rbx),%%xmm6	\n\t"\
		"movaps	0x10(%%rbx),%%xmm7	\n\t"\
		"mulpd	    (%%rsi),%%xmm4	/* ce */\n\t"\
		"mulpd	    (%%rsi),%%xmm5	\n\t"\
		"mulpd	0x10(%%rsi),%%xmm6	\n\t"\
		"mulpd	0x10(%%rsi),%%xmm7	\n\t"\
		"addpd	%%xmm6,%%xmm5		\n\t"\
		"subpd	%%xmm7,%%xmm4		\n\t"\
		"movaps	%%xmm5,%%xmm7		\n\t"\
		"movaps	%%xmm4,%%xmm6		\n\t"\
		"subpd	    (%%rax),%%xmm4	\n\t"\
		"subpd	0x10(%%rax),%%xmm5	\n\t"\
		"addpd	    (%%rax),%%xmm6	\n\t"\
		"addpd	0x10(%%rax),%%xmm7	\n\t"\
		"/* Finish radix-4 butterfly and store results into temporary-array slots: */\n\t"\
		"subpd	%%xmm6,%%xmm0		\n\t"\
		"subpd	%%xmm5,%%xmm2		\n\t"\
		"subpd	%%xmm7,%%xmm1		\n\t"\
		"subpd	%%xmm4,%%xmm3		\n\t"\
		"movaps	%%xmm0,    (%%rcx)	\n\t"\
		"movaps	%%xmm2,    (%%rbx)	\n\t"\
		"movaps	%%xmm1,0x10(%%rcx)	\n\t"\
		"movaps	%%xmm3,0x10(%%rdx)	\n\t"\
		"addpd	%%xmm6,%%xmm6		\n\t"\
		"addpd	%%xmm5,%%xmm5		\n\t"\
		"addpd	%%xmm7,%%xmm7		\n\t"\
		"addpd	%%xmm4,%%xmm4		\n\t"\
		"addpd	%%xmm0,%%xmm6		\n\t"\
		"addpd	%%xmm2,%%xmm5		\n\t"\
		"addpd	%%xmm1,%%xmm7		\n\t"\
		"addpd	%%xmm3,%%xmm4		\n\t"\
		"movaps	%%xmm6,    (%%rax)	\n\t"\
		"movaps	%%xmm5,    (%%rdx)	\n\t"\
		"movaps	%%xmm7,0x10(%%rax)	\n\t"\
		"movaps	%%xmm4,0x10(%%rbx)	\n\t"\
	/*************************************************************************************/\
	/*  And now do four more radix-4 transforms, including the internal twiddle factors: */\
	/*************************************************************************************/\
	/* Block 0: Combine 0-output of each radix-4, i.e. inputs from __in0 + [0,4,8,c]*istride: */\
		"movq	%[__in0],%%rax		\n\t"\
		"leaq	%c[__i4](%%rax),%%rbx	\n\t"/* __in0 +   [4*istride] */\
		"leaq	%c[__i4](%%rbx),%%rcx	\n\t"/* __in0 + 2*[4*istride] */\
		"leaq	%c[__i4](%%rcx),%%rdx	\n\t"/* __in0 + 3*[4*istride] */\
		"movaps	    (%%rax),%%xmm0	\n\t"\
		"movaps	    (%%rcx),%%xmm4	\n\t"\
		"movaps	0x10(%%rax),%%xmm1	\n\t"\
		"movaps	0x10(%%rcx),%%xmm5	\n\t"\
		"movaps	    (%%rbx),%%xmm2	\n\t"\
		"movaps	    (%%rdx),%%xmm6	\n\t"\
		"movaps	0x10(%%rbx),%%xmm3	\n\t"\
		"movaps	0x10(%%rdx),%%xmm7	\n\t"\
		"subpd	%%xmm2,%%xmm0		\n\t"\
		"subpd	%%xmm6,%%xmm4		\n\t"\
		"subpd	%%xmm3,%%xmm1		\n\t"\
		"subpd	%%xmm7,%%xmm5		\n\t"\
		"addpd	%%xmm2,%%xmm2		\n\t"\
		"addpd	%%xmm6,%%xmm6		\n\t"\
		"addpd	%%xmm3,%%xmm3		\n\t"\
		"addpd	%%xmm7,%%xmm7		\n\t"\
		"addpd	%%xmm0,%%xmm2		\n\t"\
		"addpd	%%xmm4,%%xmm6		\n\t"\
		"addpd	%%xmm1,%%xmm3		\n\t"\
		"addpd	%%xmm5,%%xmm7		\n\t"\
		"subpd	%%xmm6,%%xmm2		\n\t"\
		"subpd	%%xmm7,%%xmm3		\n\t"\
		"addpd	%%xmm6,%%xmm6		\n\t"\
		"addpd	%%xmm7,%%xmm7		\n\t"\
	/* Load output base-address into r8 and offset-array pointer into r9: */\
	"movq	%[__out0],%%r8	\n\t	movq	%[__off],%%r9	\n\t"\
	/* Block 0: r0-3 */\
		"movslq		    (%%r9),%%r10	\n\t"/* off0-3 */\
		"movslq		0x04(%%r9),%%r11	\n\t"\
		"movslq		0x08(%%r9),%%r12	\n\t"\
		"movslq		0x0c(%%r9),%%r13	\n\t"\
		"leaq	(%%r8,%%r10,8),%%r10	\n\t"/* out0 + off0-3 */\
		"leaq	(%%r8,%%r11,8),%%r11	\n\t"\
		"leaq	(%%r8,%%r12,8),%%r12	\n\t"\
		"leaq	(%%r8,%%r13,8),%%r13	\n\t"\
	"prefetcht1	0x100(%%r11)\n\t"\
		"movaps	%%xmm2,    (%%r11)	\n\t"\
		"movaps	%%xmm3,0x10(%%r11)	\n\t"\
		"addpd	%%xmm2,	%%xmm6	\n\t"\
		"addpd	%%xmm3,	%%xmm7	\n\t"\
		"movaps	%%xmm6,    (%%r10)	\n\t"\
		"movaps	%%xmm7,0x10(%%r10)	\n\t"\
		"subpd	%%xmm5,	%%xmm0	\n\t"\
		"subpd	%%xmm4,	%%xmm1	\n\t"\
		"addpd	%%xmm5,	%%xmm5	\n\t"\
		"addpd	%%xmm4,	%%xmm4	\n\t"\
		"movaps	%%xmm0,    (%%r12)	\n\t"\
		"movaps	%%xmm1,0x10(%%r13)	\n\t"\
		"addpd	%%xmm0,	%%xmm5	\n\t"\
		"addpd	%%xmm1,	%%xmm4	\n\t"\
		"movaps	%%xmm5,    (%%r13)	\n\t"\
		"movaps	%%xmm4,0x10(%%r12)	\n\t"\
	/* Block 2: Combine 2-output of each radix-4, i.e. inputs from __in0 + [4,5,6,7]*istride: */\
		"movq	%[__isrt2],%%rsi 	\n\t"\
		"movaps	(%%rsi),%%xmm3	/* isrt2 */\n\t"\
		"addq	$%c[__i1],%%rax	\n\t"/* __in0 + 1*istride */\
		"addq	$%c[__i1],%%rbx	\n\t"/* __in0 + 5*istride */\
		"addq	$%c[__i1],%%rcx	\n\t"/* __in0 + 9*istride */\
		"addq	$%c[__i1],%%rdx	\n\t"/* __in0 + d*istride */\
	"prefetcht1	0x100(%%r13)\n\t"\
		"movaps	    (%%rcx),%%xmm4	\n\t"\
		"movaps	0x10(%%rcx),%%xmm5	\n\t"\
		"movaps	    (%%rdx),%%xmm6	\n\t"\
		"movaps	0x10(%%rdx),%%xmm7	\n\t"\
		"mulpd	%%xmm3,%%xmm4		\n\t"\
		"movaps	    (%%rax),%%xmm0	\n\t"\
		"mulpd	%%xmm3,%%xmm5		\n\t"\
		"movaps	0x10(%%rax),%%xmm1	\n\t"\
		"mulpd	%%xmm3,%%xmm6		\n\t"\
		"movaps	    (%%rbx),%%xmm2	\n\t"\
		"mulpd	%%xmm3,%%xmm7		\n\t"\
		"movaps	0x10(%%rbx),%%xmm3	\n\t"\
		"subpd	%%xmm3,%%xmm0		\n\t"\
		"subpd	%%xmm5,%%xmm4		\n\t"\
		"subpd	%%xmm2,%%xmm1		\n\t"\
		"subpd	%%xmm6,%%xmm7		\n\t"\
		"addpd	%%xmm3,%%xmm3		\n\t"\
		"addpd	%%xmm5,%%xmm5		\n\t"\
		"addpd	%%xmm2,%%xmm2		\n\t"\
		"addpd	%%xmm6,%%xmm6		\n\t"\
		"addpd	%%xmm0,%%xmm3		\n\t"\
		"addpd	%%xmm4,%%xmm5		\n\t"\
		"addpd	%%xmm1,%%xmm2		\n\t"\
		"addpd	%%xmm7,%%xmm6		\n\t"\
		"subpd	%%xmm6,%%xmm4		\n\t"\
		"subpd	%%xmm7,%%xmm5		\n\t"\
		"addpd	%%xmm6,%%xmm6		\n\t"\
		"addpd	%%xmm7,%%xmm7		\n\t"\
		"addpd	%%xmm4,%%xmm6		\n\t"\
		"addpd	%%xmm5,%%xmm7		\n\t"\
		"subpd	%%xmm4,%%xmm0		\n\t"\
		"subpd	%%xmm7,%%xmm3		\n\t"\
		"subpd	%%xmm5,%%xmm2		\n\t"\
		"subpd	%%xmm6,%%xmm1		\n\t"\
		"addpd	%%xmm4,%%xmm4		\n\t"\
		"addpd	%%xmm7,%%xmm7		\n\t"\
		"addpd	%%xmm5,%%xmm5		\n\t"\
		"addpd	%%xmm6,%%xmm6		\n\t"\
		"movslq		0x10(%%r9),%%r10	\n\t"/* off4-7 */\
		"movslq		0x14(%%r9),%%r11	\n\t"\
		"movslq		0x18(%%r9),%%r12	\n\t"\
		"movslq		0x1c(%%r9),%%r13	\n\t"\
		"leaq	(%%r8,%%r10,8),%%r10	\n\t"/* out0 + off4-7 */\
		"leaq	(%%r8,%%r11,8),%%r11	\n\t"\
		"leaq	(%%r8,%%r12,8),%%r12	\n\t"\
		"leaq	(%%r8,%%r13,8),%%r13	\n\t"\
	"prefetcht1	0x100(%%r11)\n\t"\
		"movaps	%%xmm0,    (%%r11)	\n\t"\
		"movaps	%%xmm3,    (%%r12)	\n\t"\
		"movaps	%%xmm2,0x10(%%r11)	\n\t"\
		"movaps	%%xmm1,0x10(%%r13)	\n\t"\
		"addpd	%%xmm0,%%xmm4	\n\t"\
		"addpd	%%xmm3,%%xmm7	\n\t"\
		"addpd	%%xmm2,%%xmm5	\n\t"\
		"addpd	%%xmm1,%%xmm6	\n\t"\
		"movaps	%%xmm4,    (%%r10)	\n\t"\
		"movaps	%%xmm7,    (%%r13)	\n\t"\
		"movaps	%%xmm5,0x10(%%r10)	\n\t"\
		"movaps	%%xmm6,0x10(%%r12)	\n\t"\
	/* Block 1: Combine 1-output of each radix-4, i.e. inputs from __in0 + [8,9,a,b]*istride: */\
		"addq	$%c[__i1],%%rax	\n\t"/* __in0 + 2*istride */\
		"addq	$%c[__i1],%%rbx	\n\t"/* __in0 + 6*istride */\
		"addq	$%c[__i1],%%rcx	\n\t"/* __in0 + a*istride */\
		"addq	$%c[__i1],%%rdx	\n\t"/* __in0 + e*istride */\
	"prefetcht1	0x100(%%r13)\n\t"\
		"movaps	    (%%rcx),%%xmm4	\n\t"\
		"movaps	0x10(%%rcx),%%xmm5	\n\t"\
		"movaps	0x10(%%rsi),%%xmm3	/* cc0, using isrt2 as base-ptr */\n\t"\
		"movaps	0x20(%%rsi),%%xmm2	/* ss0, using isrt2 as base-ptr */\n\t"\
		"movaps	%%xmm4,%%xmm6		\n\t"\
		"movaps	%%xmm5,%%xmm7		\n\t"\
		"mulpd	%%xmm3,%%xmm4		\n\t"\
		"mulpd	%%xmm3,%%xmm5		\n\t"\
		"mulpd	%%xmm2,%%xmm6		\n\t"\
		"movaps	    (%%rdx),%%xmm0	\n\t"\
		"mulpd	%%xmm2,%%xmm7		\n\t"\
		"movaps	0x10(%%rdx),%%xmm1	\n\t"\
		"addpd	%%xmm6,%%xmm5		\n\t"\
		"movaps	%%xmm0,%%xmm6		\n\t"\
		"subpd	%%xmm7,%%xmm4		\n\t"\
		"movaps	%%xmm1,%%xmm7		\n\t"\
		"mulpd	%%xmm2,%%xmm6		\n\t"\
		"mulpd	%%xmm2,%%xmm7		\n\t"\
		"mulpd	%%xmm3,%%xmm0		\n\t"\
		"mulpd	%%xmm3,%%xmm1		\n\t"\
		"addpd	%%xmm0,%%xmm7		\n\t"\
		"subpd	%%xmm1,%%xmm6		\n\t"\
		"movaps	%%xmm4,%%xmm2		\n\t"\
		"movaps	%%xmm5,%%xmm3		\n\t"\
		"subpd	%%xmm6,%%xmm4		\n\t"\
		"subpd	%%xmm7,%%xmm5		\n\t"\
		"addpd	%%xmm2,%%xmm6		\n\t"\
		"addpd	%%xmm3,%%xmm7		\n\t"\
		"movaps	    (%%rbx),%%xmm2	\n\t"\
		"movaps	0x10(%%rbx),%%xmm3	\n\t"\
		"movaps	    (%%rsi),%%xmm1	/* isrt2 */\n\t"\
		"movaps	%%xmm2,%%xmm0		\n\t"\
		"subpd	%%xmm3,%%xmm2		\n\t"\
		"addpd	%%xmm0,%%xmm3		\n\t"\
		"mulpd	%%xmm1,%%xmm2		\n\t"\
		"mulpd	%%xmm1,%%xmm3		\n\t"\
		"movaps	    (%%rax),%%xmm0	\n\t"\
		"movaps	0x10(%%rax),%%xmm1	\n\t"\
		"subpd	%%xmm2,%%xmm0		\n\t"\
		"subpd	%%xmm3,%%xmm1		\n\t"\
		"addpd	%%xmm2,%%xmm2		\n\t"\
		"addpd	%%xmm3,%%xmm3		\n\t"\
		"addpd	%%xmm0,%%xmm2		\n\t"\
		"addpd	%%xmm1,%%xmm3		\n\t"\
		"subpd	%%xmm6,%%xmm2		\n\t"\
		"subpd	%%xmm5,%%xmm0		\n\t"\
		"subpd	%%xmm7,%%xmm3		\n\t"\
		"subpd	%%xmm4,%%xmm1		\n\t"\
		"addpd	%%xmm6,%%xmm6		\n\t"\
		"addpd	%%xmm5,%%xmm5		\n\t"\
		"addpd	%%xmm7,%%xmm7		\n\t"\
		"addpd	%%xmm4,%%xmm4		\n\t"\
		"movslq		0x20(%%r9),%%r10	\n\t"/* off8-b */\
		"movslq		0x24(%%r9),%%r11	\n\t"\
		"movslq		0x28(%%r9),%%r12	\n\t"\
		"movslq		0x2c(%%r9),%%r13	\n\t"\
		"leaq	(%%r8,%%r10,8),%%r10	\n\t"/* out0 + off8-b */\
		"leaq	(%%r8,%%r11,8),%%r11	\n\t"\
		"leaq	(%%r8,%%r12,8),%%r12	\n\t"\
		"leaq	(%%r8,%%r13,8),%%r13	\n\t"\
	"prefetcht1	0x100(%%r11)\n\t"\
		"movaps	%%xmm2,    (%%r11)	\n\t"\
		"movaps	%%xmm0,    (%%r12)	\n\t"\
		"movaps	%%xmm3,0x10(%%r11)	\n\t"\
		"movaps	%%xmm1,0x10(%%r13)	\n\t"\
		"addpd	%%xmm2,%%xmm6	\n\t"\
		"addpd	%%xmm0,%%xmm5	\n\t"\
		"addpd	%%xmm3,%%xmm7	\n\t"\
		"addpd	%%xmm1,%%xmm4	\n\t"\
		"movaps	%%xmm6,    (%%r10)	\n\t"\
		"movaps	%%xmm5,    (%%r13)	\n\t"\
		"movaps	%%xmm7,0x10(%%r10)	\n\t"\
		"movaps	%%xmm4,0x10(%%r12)	\n\t"\
	"/* Block 3: Combine 3-output of each radix-4, i.e. inputs from __in0 + [c,d,e,f]*istride: */\n\t"\
		"addq	$%c[__i1],%%rax	\n\t"/* __in0 + 3*istride */\
		"addq	$%c[__i1],%%rbx	\n\t"/* __in0 + 7*istride */\
		"addq	$%c[__i1],%%rcx	\n\t"/* __in0 + b*istride */\
		"addq	$%c[__i1],%%rdx	\n\t"/* __in0 + f*istride */\
	"prefetcht1	0x100(%%r13)\n\t"\
		"movaps	    (%%rcx),%%xmm4	\n\t"\
		"movaps	0x10(%%rcx),%%xmm5	\n\t"\
		"movaps	0x10(%%rsi),%%xmm2	/* cc0, using isrt2 as base-ptr */\n\t"\
		"movaps	0x20(%%rsi),%%xmm3	/* ss0, using isrt2 as base-ptr */\n\t"\
		"movaps	%%xmm4,%%xmm6		\n\t"\
		"movaps	%%xmm5,%%xmm7		\n\t"\
		"mulpd	%%xmm3,%%xmm4		\n\t"\
		"mulpd	%%xmm3,%%xmm5		\n\t"\
		"mulpd	%%xmm2,%%xmm6		\n\t"\
		"movaps	    (%%rdx),%%xmm0	\n\t"\
		"mulpd	%%xmm2,%%xmm7		\n\t"\
		"movaps	0x10(%%rdx),%%xmm1	\n\t"\
		"addpd	%%xmm6,%%xmm5		\n\t"\
		"movaps	%%xmm0,%%xmm6		\n\t"\
		"subpd	%%xmm7,%%xmm4		\n\t"\
		"movaps	%%xmm1,%%xmm7		\n\t"\
		"mulpd	%%xmm2,%%xmm6		\n\t"\
		"mulpd	%%xmm2,%%xmm7		\n\t"\
		"mulpd	%%xmm3,%%xmm0		\n\t"\
		"mulpd	%%xmm3,%%xmm1		\n\t"\
		"addpd	%%xmm0,%%xmm7		\n\t"\
		"subpd	%%xmm1,%%xmm6		\n\t"\
		"movaps	%%xmm4,%%xmm2		\n\t"\
		"movaps	%%xmm5,%%xmm3		\n\t"\
		"subpd	%%xmm6,%%xmm4		\n\t"\
		"subpd	%%xmm7,%%xmm5		\n\t"\
		"addpd	%%xmm2,%%xmm6		\n\t"\
		"addpd	%%xmm3,%%xmm7		\n\t"\
		"movaps	    (%%rbx),%%xmm2	\n\t"\
		"movaps	0x10(%%rbx),%%xmm3	\n\t"\
		"movaps	    (%%rsi),%%xmm1		/* isrt2 */\n\t"\
		"movaps	%%xmm2,%%xmm0		\n\t"\
		"addpd	%%xmm3,%%xmm2		\n\t"\
		"subpd	%%xmm0,%%xmm3		\n\t"\
		"mulpd	%%xmm1,%%xmm2		\n\t"\
		"mulpd	%%xmm1,%%xmm3		\n\t"\
		"movaps	    (%%rax),%%xmm0	\n\t"\
		"movaps	0x10(%%rax),%%xmm1	\n\t"\
		"subpd	%%xmm2,%%xmm0		\n\t"\
		"subpd	%%xmm3,%%xmm1		\n\t"\
		"addpd	%%xmm2,%%xmm2		\n\t"\
		"addpd	%%xmm3,%%xmm3		\n\t"\
		"addpd	%%xmm0,%%xmm2		\n\t"\
		"addpd	%%xmm1,%%xmm3		\n\t"\
		"subpd	%%xmm4,%%xmm0		\n\t"\
		"subpd	%%xmm7,%%xmm2		\n\t"\
		"subpd	%%xmm5,%%xmm1		\n\t"\
		"subpd	%%xmm6,%%xmm3		\n\t"\
		"addpd	%%xmm4,%%xmm4		\n\t"\
		"addpd	%%xmm7,%%xmm7		\n\t"\
		"addpd	%%xmm5,%%xmm5		\n\t"\
		"addpd	%%xmm6,%%xmm6		\n\t"\
		"movslq		0x30(%%r9),%%r10	\n\t"/* offc-f */\
		"movslq		0x34(%%r9),%%r11	\n\t"\
		"movslq		0x38(%%r9),%%r12	\n\t"\
		"movslq		0x3c(%%r9),%%r13	\n\t"\
		"leaq	(%%r8,%%r10,8),%%r10	\n\t"/* out0 + offc-f */\
		"leaq	(%%r8,%%r11,8),%%r11	\n\t"\
		"leaq	(%%r8,%%r12,8),%%r12	\n\t"\
		"leaq	(%%r8,%%r13,8),%%r13	\n\t"\
	"prefetcht1	0x100(%%r11)\n\t"\
		"movaps	%%xmm0,    (%%r11)	\n\t"\
		"movaps	%%xmm2,    (%%r12)	\n\t"\
		"movaps	%%xmm1,0x10(%%r11)	\n\t"\
		"movaps	%%xmm3,0x10(%%r13)	\n\t"\
		"addpd	%%xmm0,%%xmm4	\n\t"\
		"addpd	%%xmm2,%%xmm7	\n\t"\
		"addpd	%%xmm1,%%xmm5	\n\t"\
		"addpd	%%xmm3,%%xmm6	\n\t"\
		"movaps	%%xmm4,    (%%r10)	\n\t"\
		"movaps	%%xmm7,    (%%r13)	\n\t"\
		"movaps	%%xmm5,0x10(%%r10)	\n\t"\
		"movaps	%%xmm6,0x10(%%r12)	\n\t"\
	"prefetcht1	0x100(%%r13)\n\t"\
		:					/* outputs: none */\
		: [__in0] "m" (Xin0)	/* All inputs from memory addresses here */\
		 ,[__i1] "e" (Xi1)\
		 ,[__i4] "e" (Xi4)\
		 ,[__out0] "m" (Xout0)\
		 ,[__off] "m" (Xoff)\
		 ,[__isrt2] "m" (Xisrt2)\
		 ,[__c1] "m" (Xc1)\
		: "cc","memory","rax","rbx","rcx","rdx","rsi","r8","r9","r10","r11","r12","r13","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7"		/* Clobbered registers */\
	);\
	}

	#define SSE2_CMUL_EXPO(XcA,XcB,XcAmB,XcApB)\
	{\
	__asm__ volatile (\
		"movq	%[__cA]		,%%rax\n\t"\
		"movq	%[__cB]		,%%rbx\n\t"\
		"movq	%[__cAmB]	,%%rcx\n\t"\
		"movq	%[__cApB]	,%%rdx\n\t"\
		"\n\t"\
		"movaps	    (%%rax),%%xmm0\n\t"\
		"movaps	0x10(%%rax),%%xmm2\n\t"\
		"movaps	    (%%rbx),%%xmm4\n\t"\
		"movaps	0x10(%%rbx),%%xmm5\n\t"\
		"movaps	%%xmm0,%%xmm1\n\t"\
		"movaps	%%xmm2,%%xmm3\n\t"\
		"\n\t"\
		"mulpd	%%xmm4,%%xmm0\n\t"\
		"mulpd	%%xmm5,%%xmm1\n\t"\
		"mulpd	%%xmm4,%%xmm2\n\t"\
		"mulpd	%%xmm5,%%xmm3\n\t"\
		"movaps	%%xmm0,%%xmm4\n\t"\
		"movaps	%%xmm1,%%xmm5\n\t"\
		"addpd	%%xmm3,%%xmm0\n\t"\
		"subpd	%%xmm2,%%xmm1\n\t"\
		"subpd	%%xmm3,%%xmm4\n\t"\
		"addpd	%%xmm2,%%xmm5\n\t"\
		"movaps	%%xmm0,    (%%rcx)\n\t"\
		"movaps	%%xmm1,0x10(%%rcx)\n\t"\
		"movaps	%%xmm4,    (%%rdx)\n\t"\
		"movaps	%%xmm5,0x10(%%rdx)\n\t"\
		:					/* outputs: none */\
		: [__cA]  "m" (XcA)	/* All inputs from memory addresses here */\
		 ,[__cB]  "m" (XcB)\
		 ,[__cAmB] "m" (XcAmB)\
		 ,[__cApB] "m" (XcApB)\
		: "cc","memory","rax","rbx","rcx","rdx","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5"		/* Clobbered registers */\
	);\
	}

	/*
	SSE2-ified version of PAIR_SQUARE_4. Data enter in [tAr, ~tDr], [tBr, ~tCr] pointer-pairs, where the imaginary part
	of each input pair is assumed offset +0x10 in memory from the real part, i.e. needs no explicit pointer reference.
	In terms of the scalar-double pair_square() function, [tAr, ~tDr] represent one set of data-double 128-bit-wide
	H[j],H~[N-j] data-to-be-combined, and [tBr, ~tCr] a second such set. Here is the processing flow in pair_square():

		// H[j]-H~[N-j] = (r1-r2,i1+i2); ()^2 = [(r1-r2)^2-(i1+i2)^2] + 2.I.[(r1-r2).(i1+i2)]
	// calculate 2nd square-like term and store in temp...
		re = (r2+i2)*(r2-i2);	// re := Re{H(n2-j)^2}
		im = r2*i2 + i2*r2;		// im := Im{H(n2-j)^2}
	// calculate difference terms...
		r2 = r1 - r2;			// r2 := Re{H(j)-H~(n2-j)}
		i2 = i1 + i2;			// i2 := Im{H(j)-H~(n2-j)}
	// now calculate 1st square-like term and store back in H(j) slot...
		tt = (r1+i1)*(r1-i1);		// r1 := Re{H(j)^2}
		i1 = r1*i1 + i1*r1; r1 = tt;// i1 := Im{H(j)^2}
	// calculate the complex products to build the second term...
		tt = (r2+i2)*(r2-i2);		// Re{(H[j] - H~[N/2-j])^2}
		i2 = r2*i2 + i2*r2; r2 = tt;// Im{(H[j] - H~[N/2-j])^2}
		tt = (cc*r2 - ss*i2);	// Re{(1 + exp(4*pi*I*j/N)) * (H[j] - H~[N/2-j])^2/4}
		i2 = (ss*r2 + cc*i2);	// Im{(1 + exp(4*pi*I*j/N)) * (H[j] - H~[N/2-j])^2/4}
	// and now complete and store the results.
		*x1 = (r1-tt);	// Re{M(j)}
		*y1 = (i1-i2);	// Im{M(j)}
	// N-j terms are as above, but with the replacements: r1<-->r2, i1<-->i2, i3|-->-i3.
		*x2 = (re-tt);	// Re{M(N-j)}
		*y2 = (im+i2);	// Im{M(N-j)}

	For the sincos twiddles: using the notation of the scalar PAIR_SQUARE_4() macro,"__c" means [c0,s1], "__s" means [s0,c1].
	For these, due to the butterfly indexing pattern, we cannot assume that __s = __c + 0x10, so feed both pointers explicitly.

	NOTE: '~' in the above complex-arithmetic description and in the SIMD annotations below mean DIFFERENT THINGS.
	In the above complex-arithmetic ~ means complex conjugation; in the SIMD annotations below ~ means [lo,hi] doubles swapped in
	the SSE2 register, corresponding j1,j2-indexed data swaps in the input-argument ordering.
	We use shufpd xmm, xmm, 1 to swap lo and hi doubles of an xmm register for the various operations with one swapped input.
	*/
	/* PAIR_SQUARE_4_SSE2: SIMD pointwise-squaring step of the real-data FFT wrapper;
	see the derivation in the comment block above. All four t-pointers address 16-byte-
	aligned complex data (Im at +0x10 from Re) and are updated IN PLACE; __c/__s point to
	the [c0,s1]/[s0,c1] twiddle pairs and __forth to a vector constant which absorbs the
	0.25 scaling and the sign flips noted inline below. Clobbers rax,rbx,rcx,rdx, xmm0-7.
	Statement order is load-bearing (in-place temp stores to the tA-tD slots are re-read
	later) — do not reorder. */
	#define PAIR_SQUARE_4_SSE2(XtAr, XtBr, XtCr, XtDr, Xc, Xs, Xforth)\
	{\
	__asm__ volatile (\
		/*   calculate cross-product terms...
			__rt=__tAr* ~tDr+__tAi* ~tDi; __rt=__rt+__rt;
			__it=__tAi* ~tDr-__tAr* ~tDi; __it=__it+__it;
		*/\
		"movq	%[__tDr]	,%%rdx\n\t"\
		"movq	%[__tAr]	,%%rax\n\t"\
		"\n\t"\
		"movaps	    (%%rdx)	,%%xmm6		/* tDr */\n\t"\
		"movaps	0x10(%%rdx)	,%%xmm7		/* tDi */\n\t"\
		"movaps	    (%%rax)	,%%xmm0		/* tAr */\n\t"\
		"movaps	0x10(%%rax)	,%%xmm3		/* tAi */\n\t"\
		"shufpd	$1	,%%xmm6	,%%xmm6	/*~tDr */\n\t"\
		"shufpd	$1	,%%xmm7	,%%xmm7	/*~tDi */\n\t"\
		"movaps	    (%%rax)	,%%xmm2		/* cpy tAr */\n\t"\
		"movaps	0x10(%%rax)	,%%xmm1		/* cpy tAi */\n\t"\
		"\n\t"\
		"mulpd	%%xmm6		,%%xmm0	/* tAr*~tDr */\n\t"\
		"mulpd	%%xmm7		,%%xmm3	/* tAi*~tDi */\n\t"\
		"mulpd	%%xmm6		,%%xmm1	/* tAi*~tDr */\n\t"\
		"mulpd	%%xmm7		,%%xmm2	/* tAr*~tDi */\n\t"\
		"addpd	%%xmm3		,%%xmm0	/* rt */\n\t"\
		"subpd	%%xmm2		,%%xmm1	/* it */\n\t"\
		"addpd	%%xmm0		,%%xmm0	/* rt=rt+rt */\n\t"\
		"addpd	%%xmm1		,%%xmm1	/* it=it+it; xmm2-7 free */\n\t"\
		/*
			__st=__tBr* ~tCr+__tBi* ~tCi; __st=__st+__st;
			__jt=__tBi* ~tCr-__tBr* ~tCi; __jt=__jt+__jt;
		*/\
		"movq	%[__tCr]	,%%rcx\n\t"\
		"movq	%[__tBr]	,%%rbx\n\t"\
		"\n\t"\
		"movaps	    (%%rcx)	,%%xmm6		/* tCr */\n\t"\
		"movaps	0x10(%%rcx)	,%%xmm7		/* tCi */\n\t"\
		"movaps	    (%%rbx)	,%%xmm2		/* tBr */\n\t"\
		"movaps	0x10(%%rbx)	,%%xmm5		/* tBi */\n\t"\
		"shufpd	$1	,%%xmm6	,%%xmm6	/*~tCr */\n\t"\
		"shufpd	$1	,%%xmm7	,%%xmm7	/*~tCi */\n\t"\
		"movaps	    (%%rbx)	,%%xmm4		/* cpy tBr */\n\t"\
		"movaps	0x10(%%rbx)	,%%xmm3		/* cpy tBi */\n\t"\
		"\n\t"\
		"mulpd	%%xmm6		,%%xmm2	/* tBr*~tCr */\n\t"\
		"mulpd	%%xmm7		,%%xmm5	/* tBi*~tCi */\n\t"\
		"mulpd	%%xmm6		,%%xmm3	/* tBi*~tCr */\n\t"\
		"mulpd	%%xmm7		,%%xmm4	/* tBr*~tCi */\n\t"\
		"addpd	%%xmm5		,%%xmm2	/* st */\n\t"\
		"subpd	%%xmm4		,%%xmm3	/* jt */\n\t"\
		"addpd	%%xmm2		,%%xmm2	/* st=st+st */\n\t"\
		"addpd	%%xmm3		,%%xmm3	/* jt=jt+jt; xmm4-7 free */\n\t"\
		/*   now calculate square terms and __store back in the same temporaries:
			__tmp = (__tAr+__tAi)*(__tAr-__tAi); __tAi=__tAr*__tAi; __tAi=__tAi+__tAi; __tAr=__tmp;
		*/\
		"movaps	    (%%rax)	,%%xmm4		/* __tAr */\n\t"\
		"movaps	0x10(%%rax)	,%%xmm5		/* __tAi */\n\t"\
		"subpd	%%xmm5		,%%xmm4		/* (__tAr-__tAi) */\n\t"\
		"addpd	%%xmm5		,%%xmm5		/*      2*__tAi  */\n\t"\
		"addpd	%%xmm4		,%%xmm5		/* (__tAr+__tAi) */\n\t"\
		"mulpd	%%xmm5		,%%xmm4		/*>__tAr */\n\t"\
		"\n\t"\
		"movaps	    (%%rax)	,%%xmm5		/* __tAr */\n\t"\
		"mulpd	0x10(%%rax)	,%%xmm5		/* __tAr*__tAi */\n\t"\
		"addpd	%%xmm5		,%%xmm5		/*>__tAi */\n\t"\
		"movaps	%%xmm4	,    (%%rax)	/* tmp store >__tAr */\n\t"\
		"movaps	%%xmm5	,0x10(%%rax)	/* tmp store >__tAi */\n\t"\
		"\n\t"\
		"subpd	%%xmm4		,%%xmm0	/* rt-__tAr */\n\t"\
		"subpd	%%xmm5		,%%xmm1	/* it-__tAi; xmm4-7 free */\n\t"\
		"\n\t"\
		/*	__tmp=(__tBr+__tBi)*(__tBr-__tBi); __tBi=__tBr*__tBi; __tBi=__tBi+__tBi; __tBr=__tmp;
			[Can be done in parallel with above segment]
		*/\
		"movaps	    (%%rbx)	,%%xmm6		/* __tBr */\n\t"\
		"movaps	0x10(%%rbx)	,%%xmm7		/* __tBi */\n\t"\
		"subpd	%%xmm7		,%%xmm6		/* (__tBr-__tBi) */\n\t"\
		"addpd	%%xmm7		,%%xmm7		/*      2*__tBi  */\n\t"\
		"addpd	%%xmm6		,%%xmm7		/* (__tBr+__tBi) */\n\t"\
		"mulpd	%%xmm7		,%%xmm6		/*>__tBr */\n\t"\
		"\n\t"\
		"movaps	    (%%rbx)	,%%xmm7		/* __tBr */\n\t"\
		"mulpd	0x10(%%rbx)	,%%xmm7		/* __tBr*__tBi */\n\t"\
		"addpd	%%xmm7		,%%xmm7		/*>__tBi */\n\t"\
		"movaps	%%xmm6	,    (%%rbx)	/* tmp store >__tBr */\n\t"\
		"movaps	%%xmm7	,0x10(%%rbx)	/* tmp store >__tBi */\n\t"\
		"\n\t"\
		"subpd	%%xmm6		,%%xmm2	/* st-__tBr */\n\t"\
		"subpd	%%xmm7		,%%xmm3	/* jt-__tBi; xmm4-7 free */\n\t"\
		/*
			__tmp=(__tDr+__tDi)*(__tDr-__tDi); __tDi=__tDr*__tDi; __tDi=__tDi+__tDi; __tDr=__tmp;
		*/\
		"movaps	    (%%rdx)	,%%xmm4		/* __tDr */\n\t"\
		"movaps	0x10(%%rdx)	,%%xmm5		/* __tDi */\n\t"\
		"subpd	%%xmm5		,%%xmm4		/* (__tDr-__tDi) */\n\t"\
		"addpd	%%xmm5		,%%xmm5		/*      2*__tDi  */\n\t"\
		"addpd	%%xmm4		,%%xmm5		/* (__tDr+__tDi) */\n\t"\
		"mulpd	%%xmm5		,%%xmm4		/*>__tDr */\n\t"\
		"\n\t"\
		"movaps	    (%%rdx)	,%%xmm5		/* __tDr */\n\t"\
		"mulpd	0x10(%%rdx)	,%%xmm5		/* __tDr*__tDi */\n\t"\
		"addpd	%%xmm5		,%%xmm5		/*>__tDi */\n\t"\
		"movaps	%%xmm4	,    (%%rdx)	/* tmp store ~tDr */\n\t"\
		"movaps	%%xmm5	,0x10(%%rdx)	/* tmp store ~tDi */\n\t"\
		"shufpd	$1	,%%xmm4	,%%xmm4	/*~tDr */\n\t"\
		"shufpd	$1	,%%xmm5	,%%xmm5	/*~tDi */\n\t"\
		"\n\t"\
		"subpd	%%xmm4		,%%xmm0	/* rt-__tAr- ~tDr */\n\t"\
		"addpd	%%xmm5		,%%xmm1	/* it-__tAi+ ~tDi; xmm4-7 free */\n\t"\
		/*
			__tmp = (__tCr+__tCi)*(__tCr-__tCi); __tCi=__tCr*__tCi; __tCi=__tCi+__tCi; __tCr=__tmp;
			[Can be done in parallel with above segment] */\
		"movaps	    (%%rcx)	,%%xmm6		/* __tCr */\n\t"\
		"movaps	0x10(%%rcx)	,%%xmm7		/* __tCi */\n\t"\
		"subpd	%%xmm7		,%%xmm6		/* (__tCr-__tCi) */\n\t"\
		"addpd	%%xmm7		,%%xmm7		/*      2*__tCi  */\n\t"\
		"addpd	%%xmm6		,%%xmm7		/* (__tCr+__tCi) */\n\t"\
		"mulpd	%%xmm7		,%%xmm6		/*>__tCr */\n\t"\
		"\n\t"\
		"movaps	    (%%rcx)	,%%xmm7		/* __tCr */\n\t"\
		"mulpd	0x10(%%rcx)	,%%xmm7		/* __tCr*__tCi */\n\t"\
		"addpd	%%xmm7		,%%xmm7		/*>__tCi */\n\t"\
		"movaps	%%xmm6	,    (%%rcx)	/* tmp store ~tCr */\n\t"\
		"movaps	%%xmm7	,0x10(%%rcx)	/* tmp store ~tCi */\n\t"\
		"shufpd	$1	,%%xmm6	,%%xmm6	/*~tCr */\n\t"\
		"shufpd	$1	,%%xmm7	,%%xmm7	/*~tCi */\n\t"\
		"\n\t"\
		"subpd	%%xmm6		,%%xmm2	/* st-__tBr- ~tCr */\n\t"\
		"addpd	%%xmm7		,%%xmm3	/* jt-__tBi+ ~tCi; xmm4-7 free */\n\t"\
		/*
			__tmp=((1.0+__c)*__rt-__s*__it)*0.25;
			__it =((1.0+__c)*__it+__s*__rt)*0.25;	__rt=__tmp;
			[Can be done in parallel with above segment]
		*/\
		"movq	%[__c]		,%%rax\n\t"/* rax,rbx,rdx now repurposed for the twiddle/constant pointers */\
		"movq	%[__s]		,%%rbx\n\t"\
		"movq	%[__forth]	,%%rdx\n\t"\
		"movaps	%%xmm0		,%%xmm4		/* cpy rt */\n\t"\
		"movaps	%%xmm1		,%%xmm5		/* cpy it */\n\t"\
		"mulpd	(%%rax)		,%%xmm0		/* c*rt */\n\t"\
		"mulpd	(%%rax)		,%%xmm1		/* c*it */\n\t"\
		"addpd	%%xmm4		,%%xmm0		/* (c+1.0)*rt */\n\t"\
		"addpd	%%xmm5		,%%xmm1		/* (c+1.0)*it */\n\t"\
		"mulpd	(%%rbx)		,%%xmm4		/* s*rt */\n\t"\
		"mulpd	(%%rbx)		,%%xmm5		/* s*it */\n\t"\
		"subpd	%%xmm5		,%%xmm0		/* (c+1.0)*rt-s*it */\n\t"\
		"addpd	%%xmm4		,%%xmm1		/* (c+1.0)*it+s*rt; xmm4,5 free */\n\t"\
		"mulpd	(%%rdx)		,%%xmm0	/* -rt Both of these inherit the sign flip [w.r.to the non-SSE2 PAIR_SQUARE_4 macro] */\n\t"\
		"mulpd	(%%rdx)		,%%xmm1	/* -it that resulted from the in-place-friendlier (rt-__tAr- ~tDr) reordering above. */\n\t"\
		/*
			__tmp=((1.0-__s)*__st-__c*__jt)*0.25;
			__jt =((1.0-__s)*__jt+__c*__st)*0.25	__st=__tmp;
			[Can be done in parallel with above segment]
		*/\
		"movaps	%%xmm2		,%%xmm6		/* cpy st */\n\t"\
		"movaps	%%xmm3		,%%xmm7		/* cpy jt */\n\t"\
		"mulpd	(%%rbx)		,%%xmm2		/* s*st */\n\t"\
		"mulpd	(%%rbx)		,%%xmm3		/* s*jt */\n\t"\
		"subpd	%%xmm6		,%%xmm2		/* (s-1.0)*st, note sign flip! */\n\t"\
		"subpd	%%xmm7		,%%xmm3		/* (s-1.0)*jt, note sign flip! */\n\t"\
		"mulpd	(%%rax)		,%%xmm6		/* c*st */\n\t"\
		"mulpd	(%%rax)		,%%xmm7		/* c*jt */\n\t"\
		"addpd	%%xmm7		,%%xmm2		/* -[(1.0-s)*st-c*jt] */\n\t"\
		"subpd	%%xmm6		,%%xmm3		/* -[(1.0-s)*jt+c*st]; xmm6,7 free */\n\t"\
		"mulpd	(%%rdx)		,%%xmm2	/* +st Sign flip due to (s-1.0) reordering here */\n\t"\
		"mulpd	(%%rdx)		,%%xmm3	/* +jt cancels earlier one due to in-place-friendlier (st-__tBr- ~tCr) reordering above. */\n\t"\
		/*...and now complete and store the results. We flip the signs on st and jt here to undo the above -st,-jt negations. */\
		/*	__tAr = (__tAr+__rt);
			__tAi = (__tAi+__it);
			__tBr = (__tBr-__st);
			__tBi = (__tBi-__jt);
		*/\
		"movq	%[__tAr]	,%%rax\n\t"\
		"movq	%[__tBr]	,%%rbx\n\t"\
		"\n\t"\
		"movaps	    (%%rax)	,%%xmm4		/* __tAr */\n\t"\
		"movaps	0x10(%%rax)	,%%xmm5		/* __tAi */\n\t"\
		"movaps	    (%%rbx)	,%%xmm6		/* __tBr */\n\t"\
		"movaps	0x10(%%rbx)	,%%xmm7		/* __tBi */\n\t"\
		"addpd	%%xmm0		,%%xmm4		/* (__tAr+__rt) */\n\t"\
		"addpd	%%xmm1		,%%xmm5		/* (__tAi+__it) */\n\t"\
		"subpd	%%xmm2		,%%xmm6		/* (__tBr-__st) */\n\t"\
		"subpd	%%xmm3		,%%xmm7		/* (__tBi-__jt) */\n\t"\
		"movaps	%%xmm4	,    (%%rax)	/* store >__tAr */\n\t"\
		"movaps	%%xmm5	,0x10(%%rax)	/* store >__tAi */\n\t"\
		"movaps	%%xmm6	,    (%%rbx)	/* store >__tBr */\n\t"\
		"movaps	%%xmm7	,0x10(%%rbx)	/* store >__tBi */\n\t"\
		/*...N-j terms are as above, but with the replacements: __tAr<--> ~tDr, __tAi<--> ~tDi, __it|-->-__it. */\
		/*	__tDr = (__tDr+ ~rt);
			__tDi = (__tDi- ~it);
			__tCr = (__tCr- ~st);
			__tCi = (__tCi+ ~jt);
		*/\
		"movq	%[__tCr]	,%%rcx\n\t"\
		"movq	%[__tDr]	,%%rdx\n\t"\
		"\n\t"\
		"shufpd	$1	,%%xmm0	,%%xmm0		/* ~rt */\n\t"\
		"shufpd	$1	,%%xmm1	,%%xmm1		/* ~it */\n\t"\
		"shufpd	$1	,%%xmm2	,%%xmm2		/* ~st */\n\t"\
		"shufpd	$1	,%%xmm3	,%%xmm3		/* ~jt */\n\t"\
		"\n\t"\
		"movaps	    (%%rdx)	,%%xmm4		/* __tDr */\n\t"\
		"movaps	0x10(%%rdx)	,%%xmm5		/* __tDi */\n\t"\
		"movaps	    (%%rcx)	,%%xmm6		/* __tCr */\n\t"\
		"movaps	0x10(%%rcx)	,%%xmm7		/* __tCi */\n\t"\
		"addpd	%%xmm0		,%%xmm4		/* (__tDr+ ~rt) */\n\t"\
		"subpd	%%xmm1		,%%xmm5		/* (__tDi- ~it) */\n\t"\
		"subpd	%%xmm2		,%%xmm6		/* (__tCr- ~st) */\n\t"\
		"addpd	%%xmm3		,%%xmm7		/* (__tCi+ ~jt) */\n\t"\
		"movaps	%%xmm4	,    (%%rdx)	/* store >__tDr */\n\t"\
		"movaps	%%xmm5	,0x10(%%rdx)	/* store >__tDi */\n\t"\
		"movaps	%%xmm6	,    (%%rcx)	/* store >__tCr */\n\t"\
		"movaps	%%xmm7	,0x10(%%rcx)	/* store >__tCi */\n\t"\
		/* Cost: [64 vector-load/store (16 implicit), 12 shufpd, 48 addpd, 28 mulpd, 4 vector-register-copy] */\
		:					/* outputs: none */\
		: [__tAr] "m" (XtAr)	/* All inputs from memory addresses here */\
		 ,[__tBr] "m" (XtBr)\
		 ,[__tCr] "m" (XtCr)\
		 ,[__tDr] "m" (XtDr)\
		 ,[__c] "m" (Xc)\
		 ,[__s] "m" (Xs)\
		 ,[__forth] "m" (Xforth)\
		: "cc","memory","rax","rbx","rcx","rdx","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7"		/* Clobbered registers */\
	);\
	}

	// Sep 2019: 2-input FFT(a)*FFT(b) version of above PAIR_SQUARE_4_SSE2 macro, based on PAIR_MUL_4 macro in pair_square.h:
	// NOTE: Unlike the PAIR_SQUARE_4 version of this macro, the MUL version assumes the sincos terms premultiplied by 1/4!
	/* PAIR_MUL_4_SSE2: 2-input FFT(a)*FFT(b) analog of PAIR_SQUARE_4_SSE2 above, per the
	scalar PAIR_MUL_4 macro in pair_square.h. A0-A3 are updated in place with the pointwise
	product against B0-B3; A2/A3 slots double as scratch for the t0-t3 temporaries mid-macro.
	Unlike the SQUARE version, __c/__s sincos data are assumed PREMULTIPLIED by 0.25.
	NOTE: the author-flagged sign negations of a2r/a3r/b2r/b3r below are deliberate and are
	compensated by the +-swaps in the twiddle-mul and final-butterfly sections — see the
	starred comments inline; do not "fix" one section in isolation.
	Requires x86-64 (uses xmm8-15); clobbers rax,rbx,rcx,rdx,rdi,rsi and xmm0-15. */
	#define PAIR_MUL_4_SSE2(XA0,XA1,XA2,XA3, XB0,XB1,XB2,XB3, Xc,Xs,Xforth)\
	{\
	__asm__ volatile (\
		/* Load a2,a3 and b2,b3, d0,d1-swap, then compute
			t0 = ~a3r*~b3r - ~a3i*~b3i, t2 = ~a3r*~b3i + ~a3i*~b3r
			t1 = ~a2r*~b2r - ~a2i*~b2i, t3 = ~a2r*~b2i + ~a2i*~b2r
		*/\
		"movq	%[__A2]	,%%rcx	\n\t"\
		"movq	%[__A3]	,%%rdx	\n\t"\
		"movq	%[__B2]	,%%rdi	\n\t"\
		"movq	%[__B3]	,%%rsi	\n\t"\
		/* Must load double-pairs-to-be-swapped into regs first, since SHUFPD takes low double from DEST and high from SRC: */\
		"movaps	    (%%rcx),%%xmm0		\n\t	shufpd	$1,%%xmm0,%%xmm0	\n\t"/* ~a2r */\
		"movaps	0x10(%%rcx),%%xmm1		\n\t	shufpd	$1,%%xmm1,%%xmm1	\n\t"/* ~a2i */\
		"movaps	    (%%rdi),%%xmm4		\n\t	shufpd	$1,%%xmm4,%%xmm4	\n\t"/* ~b2r */\
		"movaps	0x10(%%rdi),%%xmm5		\n\t	shufpd	$1,%%xmm5,%%xmm5	\n\t"/* ~b2i */\
		"movaps	%%xmm0	,%%xmm8			\n\t	mulpd	%%xmm4	,%%xmm8		\n\t"/* ~a2r*~b2r */\
		"movaps	%%xmm1	,%%xmm10		\n\t	mulpd	%%xmm5	,%%xmm10	\n\t"/* ~a2i*~b2i */\
		"movaps	%%xmm5	,%%xmm11		\n\t	mulpd	%%xmm0	,%%xmm11	\n\t"/* ~a2r*~b2i */\
		"movaps	%%xmm4	,%%xmm9			\n\t	mulpd	%%xmm1	,%%xmm9		\n\t"/* ~a2i*~b2r */\
		"subpd	%%xmm10	,%%xmm8			\n\t	addpd	%%xmm11	,%%xmm9		\n\t"/* t1,t3 */\
		"movaps	    (%%rdx),%%xmm2		\n\t	shufpd	$1,%%xmm2,%%xmm2	\n\t"/* ~a3r */\
		"movaps	0x10(%%rdx),%%xmm3		\n\t	shufpd	$1,%%xmm3,%%xmm3	\n\t"/* ~a3i */\
		"movaps	    (%%rsi),%%xmm6		\n\t	shufpd	$1,%%xmm6,%%xmm6	\n\t"/* ~b3r */\
		"movaps	0x10(%%rsi),%%xmm7		\n\t	shufpd	$1,%%xmm7,%%xmm7	\n\t"/* ~b3i */\
		/* t1,3 not needed until final butterfly sequence, so write back to A2,3 memlocs: */\
		"movaps	%%xmm8	,    (%%rcx)	\n\t	movq	%[__A0]	,%%rax	\n\t"\
		"movaps	%%xmm9	,0x10(%%rcx)	\n\t	movq	%[__A1]	,%%rbx	\n\t"\
		"movaps	%%xmm2	,%%xmm8			\n\t	mulpd	%%xmm6	,%%xmm8		\n\t"/* ~a3r*~b3r */\
		"movaps	%%xmm3	,%%xmm10		\n\t	mulpd	%%xmm7	,%%xmm10	\n\t"/* ~a3i*~b3i */\
		"movaps	%%xmm7	,%%xmm11		\n\t	mulpd	%%xmm2	,%%xmm11	\n\t"/* ~a3r*~b3i */\
		"movaps	%%xmm6	,%%xmm9			\n\t	mulpd	%%xmm3	,%%xmm9		\n\t"/* ~a3i*~b3r */\
		"subpd	%%xmm10	,%%xmm8			\n\t	addpd	%%xmm11	,%%xmm9		\n\t"/* t0,t2 */\
		/* t0,2 not needed until final butterfly sequence, so write back to A2,3 memlocs: */\
		"movaps	%%xmm8	,    (%%rdx)	\n\t	movq	%[__B0]	,%%rdi	\n\t"\
		"movaps	%%xmm9	,0x10(%%rdx)	\n\t	movq	%[__B1]	,%%rsi	\n\t"\
	/* a2,3 in xmm0-3, b2,3 in xmm4-7, t1,3 in (rcx), t0,2 in (rdx) */\
		/* calculate difference terms...these need the [a,b][2|3] vector-data to be d0,1-swapped:
			~a3r -= a0r, ~a3i += a0i,
			~a2r -= a1r, ~a2i += a1i, similar for b-data, but move ~b2 -+ b1 down to just before a1*b1 cmul to free up 2 regs.
		*/\
/*** Need ~a3r = a0r - ~a3r, not ~a3r -= a0r! [Similar for a2r,b3r,b2r] ***
************** As currently, a2r,a3r,b2r,b3r all negated! ****************/\
		"movaps	    (%%rax)	,%%xmm8		\n\t	subpd	%%xmm8	,%%xmm2		\n\t"/* ~a3r -= a0r */\
		"movaps	0x10(%%rax)	,%%xmm9		\n\t	addpd	%%xmm9	,%%xmm3		\n\t"/* ~a3i += a0i */\
		"movaps	    (%%rbx)	,%%xmm10	\n\t	subpd	%%xmm10	,%%xmm0		\n\t"/* ~a2r -= a1r */\
		"movaps	0x10(%%rbx)	,%%xmm11	\n\t	addpd	%%xmm11	,%%xmm1		\n\t"/* ~a2i += a1i */\
		"movaps	    (%%rdi)	,%%xmm14	\n\t	subpd	%%xmm14	,%%xmm6		\n\t"/* ~b3r -= b0r */\
		"movaps	0x10(%%rdi)	,%%xmm15	\n\t	addpd	%%xmm15	,%%xmm7		\n\t"/* ~b3i += b0i */\
		/* now calculate 1st square-like term and store back in H(j) slot:
			t4 = a0r*b0r - a0i*b0i, a0i = a0r*b0i + a0i*b0r, a0r = t4
			t5 = a1r*b1r - a1i*b1i, a1i = a1r*b1i + a1i*b1r, a1r = t5
		*/\
		"movaps	%%xmm8	,%%xmm12		\n\t	mulpd	%%xmm14	,%%xmm8		\n\t"/* a0r*b0r */\
		"movaps	%%xmm9	,%%xmm13		\n\t	mulpd	%%xmm15	,%%xmm13	\n\t"/* a0i*b0i */\
		"										mulpd	%%xmm15	,%%xmm12	\n\t"/* a0r*b0i */\
		"										mulpd	%%xmm14	,%%xmm9		\n\t"/* a0i*b0r */\
		"subpd	%%xmm13	,%%xmm8			\n\t	addpd	%%xmm12	,%%xmm9		\n\t"	/* a0r,i in xmm8,9 */\
	/*** Consider overlapping these 2 cmul to better hide latency ***/\
		"movaps	    (%%rsi)	,%%xmm14	\n\t	subpd	%%xmm14	,%%xmm4		\n\t"/* ~b2r -= b1r */\
		"movaps	0x10(%%rsi)	,%%xmm15	\n\t	addpd	%%xmm15	,%%xmm5		\n\t"/* ~b2i += b1i */\
		"movaps	%%xmm10	,%%xmm12		\n\t	mulpd	%%xmm14	,%%xmm10	\n\t"/* a1r*b1r */\
		"movaps	%%xmm11	,%%xmm13		\n\t	mulpd	%%xmm15	,%%xmm13	\n\t"/* a1i*b1i */\
		"										mulpd	%%xmm15	,%%xmm12	\n\t"/* a1r*b1i */\
		"										mulpd	%%xmm14	,%%xmm11	\n\t"/* a1i*b1r */\
		"subpd	%%xmm13	,%%xmm10		\n\t	addpd	%%xmm12	,%%xmm11	\n\t"	/* a1r,i in xmm10,11 */\
	/* a0,1 in xmm8-11, a2,3 in xmm0-3, b2,3 in xmm4-7, t1,3 in (rcx), t0,2 in (rdx) */\
		/* calculate the complex products to build the second term:
			t4 = ~a3r*~b3r - ~a3i*~b3i, ~a3i = ~a3r*~b3i + ~a3i*~b3r, ~a3r,i in xmm2,3, ~b3r,i in xmm6,7
			t5 = ~a2r*~b2r - ~a2i*~b2i, ~a2i = ~a2r*~b2i + ~a2i*~b2r, ~arr,i in xmm0,1, ~b2r,i in xmm4,5
		*/\
/****************** a2r,a3r,b2r,b3r being negated means a2i,a3i come out negated ****************/\
		"movaps	%%xmm2	,%%xmm12		\n\t	mulpd	%%xmm6	,%%xmm2		\n\t"/* ~a3r*~b3r */\
		"movaps	%%xmm3	,%%xmm13		\n\t	mulpd	%%xmm7	,%%xmm13	\n\t"/* ~a3i*~b3i */\
		"										mulpd	%%xmm7	,%%xmm12	\n\t"/* ~a3r*~b3i */\
		"										mulpd	%%xmm6	,%%xmm3		\n\t"/* ~a3i*~b3r */\
		"subpd	%%xmm13	,%%xmm2			\n\t	addpd	%%xmm12	,%%xmm3		\n\t"	/* t4,~a3i in xmm2,3 */\
	/*** Consider overlapping these 2 cmul to better hide latency ***/\
		"movaps	%%xmm0	,%%xmm14		\n\t	mulpd	%%xmm4	,%%xmm0		\n\t"/* ~a2r*~b2r */\
		"movaps	%%xmm1	,%%xmm15		\n\t	mulpd	%%xmm5	,%%xmm15	\n\t"/* ~a2i*~b2i */\
		"										mulpd	%%xmm5	,%%xmm14	\n\t"/* ~a2r*~b2i */\
		"										mulpd	%%xmm4	,%%xmm1		\n\t"/* ~a2i*~b2r */\
		"subpd	%%xmm15	,%%xmm0			\n\t	addpd	%%xmm14	,%%xmm1		\n\t"	/* t5,~a2i in xmm0,1 */\
		/* xmm4-7,12-15 free */\
		/* Assume [c0,s1],[s0,c1] sincos vector-data are in the [c] and [s]-input-pointers, then compute
			~a3r = [cc+0.25]*t4 - [ss]*~a3i, ~a3i = [ss]*t4 + [cc+0.25]*~a3i
			~a2r = [0.25-ss]*t5 - [cc]*~a2i, ~a2i = [cc]*t5 + [0.25-ss]*~a2i ,
		where cc = 0.25*[c0,s1] and ss = 0.25*[s0,c1]:
		*/\
/****************** a2i,a3i being negated requires +- sign swap in this next computation ****************/\
		"movq	%[__forth],%%rdi		\n\t	movaps	(%%rdi),%%xmm6		\n\t	movaps	%%xmm6,%%xmm7	\n\t"/* 2 copies of 0.25 */\
		"movq	%[__c]	,%%rdi			\n\t	movaps	(%%rdi),%%xmm4		\n\t"/*	cc assumed premultiplied by 0.25 */\
		"movq	%[__s]	,%%rsi			\n\t	movaps	(%%rsi),%%xmm5		\n\t"/*	ss assumed premultiplied by 0.25 */\
		"addpd	%%xmm4	,%%xmm6			\n\t	subpd	%%xmm5	,%%xmm7		\n\t"	/* [cc+0.25],[0.25-ss] in xmm6,7 */\
		"movaps	%%xmm2	,%%xmm12		\n\t	mulpd	%%xmm6	,%%xmm2		\n\t"/*   t4*[cc+0.25] */\
		"movaps	%%xmm3	,%%xmm13		\n\t	mulpd	%%xmm5	,%%xmm13	\n\t"/* ~a3i*[ss] */\
		"										mulpd	%%xmm5	,%%xmm12	\n\t"/*   t4*[ss] */\
		"										mulpd	%%xmm6	,%%xmm3		\n\t"/* ~a3i*[cc+0.25] */\
		"addpd	%%xmm13	,%%xmm2			\n\t	subpd	%%xmm12	,%%xmm3		\n\t"	/* ~a3r = [cc+0.25]*t4 - [ss]*~a3i, ~a3i = [cc+0.25]*~a3i - [ss]*t4 in xmm2,3 */\
	/*** Consider overlapping these 2 cmul to better hide latency ***/\
		"movaps	%%xmm0	,%%xmm14		\n\t	mulpd	%%xmm7	,%%xmm0		\n\t"/*   t5*[0.25-ss] */\
		"movaps	%%xmm1	,%%xmm15		\n\t	mulpd	%%xmm4	,%%xmm15	\n\t"/* ~a2i*[cc] */\
		"										mulpd	%%xmm4	,%%xmm14	\n\t"/*   t5*[cc] */\
		"										mulpd	%%xmm7	,%%xmm1		\n\t"/* ~a2i*[0.25-ss] */\
		"addpd	%%xmm15	,%%xmm0			\n\t	subpd	%%xmm14	,%%xmm1		\n\t"	/* ~a2r = [0.25-ss]*t5 - [cc]*~a2i, ~a2i = [0.25-ss]*~a2i - [cc]*t5 in xmm0,1 */\
/****************** a2i,a3i negated ****************/\
	/* a0,1 in xmm8-11, a2,3 in xmm0-3, t1,3 in (rcx), t0,2 in (rdx) */\
		"movaps	    (%%rdx)	,%%xmm4		\n\t"/* t0 */\
		"movaps	0x10(%%rdx)	,%%xmm5		\n\t"/* t2 */\
		"movaps	    (%%rcx)	,%%xmm6		\n\t"/* t1 */\
		"movaps	0x10(%%rcx)	,%%xmm7		\n\t"/* t3 */\
	/* and now complete and store the results:
		a0r -= ~a3r, a0i -= ~a3i
		a1r -= ~a2r, a1i -= ~a2i
	N-j terms:
		~a3r = t0 - ~a3r, ~a3i += t2
		~a2r = t1 - ~a2r, ~a2i += t3
	*/\
/****************** a2i,a3i negated means in rcol instead computing a0,1i += ~a3,2i, a3,2i = t2,3 - a3,2i ****************/\
		"subpd	%%xmm2	,%%xmm8			\n\t	addpd	%%xmm3	,%%xmm9		\n\t"	/* a0r,i in xmm8,9 */\
		"subpd	%%xmm0	,%%xmm10		\n\t	addpd	%%xmm1	,%%xmm11	\n\t"	/* a1r,i in xmm10,11 */\
		"subpd	%%xmm2	,%%xmm4			\n\t	subpd	%%xmm3	,%%xmm5		\n\t"	/* ~a3r,i in xmm4,5 */\
		"subpd	%%xmm0	,%%xmm6			\n\t	subpd	%%xmm1	,%%xmm7		\n\t"	/* ~a2r,i in xmm6,7 */\
	/* Interleave writes of a0,a1 with un-shufflings of ~a2,~a3: */\
		"movaps	%%xmm8	,    (%%rax)	\n\t	shufpd	$1	,%%xmm4	,%%xmm4	\n\t"/* ~a3r */\
		"movaps	%%xmm9	,0x10(%%rax)	\n\t	shufpd	$1	,%%xmm5	,%%xmm5	\n\t"/* ~a3i */\
		"movaps	%%xmm10	,    (%%rbx)	\n\t	shufpd	$1	,%%xmm6	,%%xmm6	\n\t"/* ~a2r */\
		"movaps	%%xmm11	,0x10(%%rbx)	\n\t	shufpd	$1	,%%xmm7	,%%xmm7	\n\t"/* ~a2i */\
		"movaps	%%xmm4	,    (%%rdx)	\n\t"/* store A3 */\
		"movaps	%%xmm5	,0x10(%%rdx)	\n\t"\
		"movaps	%%xmm6	,    (%%rcx)	\n\t"/* store A2 */\
		"movaps	%%xmm7	,0x10(%%rcx)	\n\t"\
		/* Cost: [35 vector-load/store (0 implicit), 12 shufpd, 34 addpd, 32 mulpd, 21 vector-register-copy] */\
		:					/* outputs: none */\
		: [__A0] "m" (XA0)	/* All inputs from memory addresses here */\
		 ,[__A1] "m" (XA1)\
		 ,[__A2] "m" (XA2)\
		 ,[__A3] "m" (XA3)\
		 ,[__B0] "m" (XB0)\
		 ,[__B1] "m" (XB1)\
		 ,[__B2] "m" (XB2)\
		 ,[__B3] "m" (XB3)\
		 ,[__c] "m" (Xc)\
		 ,[__s] "m" (Xs)\
		 ,[__forth] "m" (Xforth)\
		: "cc","memory","rax","rbx","rcx","rdx","rdi","rsi","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15"	/* Clobbered registers */\
	);\
	}

	/* SSE2_RADIX_03_DFT: complex radix-3 DFT butterfly on packed double-pairs.
	Inputs i0,i1,i2 and outputs o0,o1,o2 address 16-byte-aligned complex data, Im at +0x10.
	__cc1 points to the radix-3 trig constants, Re-const at +0, Im-const at +0x10; the
	arithmetic below matches the layout {c3m1, s} = {cos(2*pi/3)-1, sin(2*pi/3)} — layout
	inferred from usage, TODO confirm against the table setup code. In/out pointers may
	coincide: all three inputs are loaded into registers before any output is stored.
	Clobbers rax,rbx,rcx,rdx and xmm0-7. */
	#define SSE2_RADIX_03_DFT(Xi0,Xi1,Xi2, Xcc1, Xo0,Xo1,Xo2)\
	{\
	__asm__ volatile (\
			"movq	%[__i0],%%rax		\n\t"\
			"movq	%[__i1],%%rbx		\n\t"\
			"movq	%[__i2],%%rcx		\n\t"\
			"movq	%[__cc1],%%rdx		\n\t"\
			"\n\t"\
			"movaps	    (%%rbx),%%xmm2	\n\t"/* B.re = i1.re */\
			"movaps	0x10(%%rbx),%%xmm3	\n\t"/* B.im */\
			"movaps	    (%%rax),%%xmm0	\n\t"/* A.re = i0.re */\
			"movaps	0x10(%%rax),%%xmm1	\n\t"/* A.im */\
			"movaps	    (%%rcx),%%xmm6	\n\t"/* C.re = i2.re */\
			"movaps	0x10(%%rcx),%%xmm7	\n\t"/* C.im */\
			"movaps	%%xmm2,%%xmm4		\n\t"/* cpy B.re */\
			"movaps	%%xmm3,%%xmm5		\n\t"/* cpy B.im */\
			"\n\t"\
			"movq	%[__o0],%%rax		\n\t"/* inputs all in-register now - reuse addr-regs for outputs */\
			"movq	%[__o1],%%rbx		\n\t"\
			"movq	%[__o2],%%rcx		\n\t"\
			"addpd	%%xmm6,%%xmm2		\n\t"/* t.re = B.re+C.re */\
			"addpd	%%xmm7,%%xmm3		\n\t"/* t.im = B.im+C.im */\
			"subpd	%%xmm6,%%xmm4		\n\t"/* u.re = B.re-C.re */\
			"subpd	%%xmm7,%%xmm5		\n\t"/* u.im = B.im-C.im */\
			"addpd	%%xmm2,%%xmm0		\n\t"/* o0.re = A.re+t.re */\
			"addpd	%%xmm3,%%xmm1		\n\t"/* o0.im = A.im+t.im */\
			"movaps	    (%%rdx),%%xmm6	\n\t"/* c3m1 */\
			"movaps	0x10(%%rdx),%%xmm7	\n\t"/* s */\
			"movaps	%%xmm0,    (%%rax)	\n\t"/* store o0 = DC output */\
			"movaps	%%xmm1,0x10(%%rax)	\n\t"\
			"\n\t"\
			"mulpd	%%xmm6,%%xmm2		\n\t"/* c3m1*t.re */\
			"mulpd	%%xmm6,%%xmm3		\n\t"/* c3m1*t.im */\
			"mulpd	%%xmm7,%%xmm4		\n\t"/* s*u.re */\
			"mulpd	%%xmm7,%%xmm5		\n\t"/* s*u.im */\
			"addpd	%%xmm0,%%xmm2		\n\t"/* m.re = o0.re + c3m1*t.re [= A.re + (1+c3m1)*t.re] */\
			"addpd	%%xmm1,%%xmm3		\n\t"/* m.im = o0.im + c3m1*t.im */\
			"\n\t"\
			"movaps	%%xmm2,%%xmm0		\n\t"/* cpy m.re */\
			"movaps	%%xmm3,%%xmm1		\n\t"/* cpy m.im */\
			"\n\t"\
			"subpd	%%xmm5,%%xmm2		\n\t"/* o1.re = m.re - s*u.im */\
			"addpd	%%xmm4,%%xmm3		\n\t"/* o1.im = m.im + s*u.re */\
			"addpd	%%xmm5,%%xmm0		\n\t"/* o2.re = m.re + s*u.im */\
			"subpd	%%xmm4,%%xmm1		\n\t"/* o2.im = m.im - s*u.re */\
			"\n\t"\
			"movaps	%%xmm2,    (%%rbx)	\n\t"/* store o1 */\
			"movaps	%%xmm3,0x10(%%rbx)	\n\t"\
			"movaps	%%xmm0,    (%%rcx)	\n\t"/* store o2 */\
			"movaps	%%xmm1,0x10(%%rcx)	\n\t"\
		:					/* outputs: none */\
		: [__i0] "m" (Xi0)	/* All inputs from memory addresses here */\
		 ,[__i1] "m" (Xi1)\
		 ,[__i2] "m" (Xi2)\
		 ,[__cc1] "m" (Xcc1)\
		 ,[__o0] "m" (Xo0)\
		 ,[__o1] "m" (Xo1)\
		 ,[__o2] "m" (Xo2)\
		: "cc","memory","rax","rbx","rcx","rdx","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7"		/* Clobbered registers */\
	);\
	}

	#define SSE2_RADIX_03_DFT_X2(Xcc0, Xi0,Xi1,Xi2, Xo0,Xo1,Xo2, Xj0,Xj1,Xj2, Xu0,Xu1,Xu2)\
	{\
	__asm__ volatile (\
		"movq	%[__i0],%%rax		\n\t	movq	%[__j0],%%r10		\n\t"\
		"movq	%[__i1],%%rbx		\n\t	movq	%[__j1],%%r11		\n\t"\
		"movq	%[__i2],%%rcx		\n\t	movq	%[__j2],%%r12		\n\t"\
		"movq	%[__cc0],%%rdx		\n\t"\
		"movaps	    (%%rbx),%%xmm2	\n\t	movaps	    (%%r11),%%xmm10	\n\t"\
		"movaps	0x10(%%rbx),%%xmm3	\n\t	movaps	0x10(%%r11),%%xmm11	\n\t"\
		"movaps	    (%%rax),%%xmm0	\n\t	movaps	    (%%r10),%%xmm8 	\n\t"\
		"movaps	0x10(%%rax),%%xmm1	\n\t	movaps	0x10(%%r10),%%xmm9 	\n\t"\
		"movaps	    (%%rcx),%%xmm6	\n\t	movaps	    (%%r12),%%xmm14	\n\t"\
		"movaps	0x10(%%rcx),%%xmm7	\n\t	movaps	0x10(%%r12),%%xmm15	\n\t"\
		"movaps	%%xmm2,%%xmm4		\n\t	movaps	%%xmm10,%%xmm12		\n\t"\
		"movaps	%%xmm3,%%xmm5		\n\t	movaps	%%xmm11,%%xmm13		\n\t"\
		"movq	%[__o0],%%rax		\n\t	movq	%[__u0],%%r10		\n\t"\
		"movq	%[__o1],%%rbx		\n\t	movq	%[__u1],%%r11		\n\t"\
		"movq	%[__o2],%%rcx		\n\t	movq	%[__u2],%%r12		\n\t"\
		"addpd	%%xmm6,%%xmm2		\n\t	addpd	%%xmm14,%%xmm10		\n\t"\
		"addpd	%%xmm7,%%xmm3		\n\t	addpd	%%xmm15,%%xmm11		\n\t"\
		"subpd	%%xmm6,%%xmm4		\n\t	subpd	%%xmm14,%%xmm12		\n\t"\
		"subpd	%%xmm7,%%xmm5		\n\t	subpd	%%xmm15,%%xmm13		\n\t"\
		"addpd	%%xmm2,%%xmm0		\n\t	addpd	%%xmm10,%%xmm8 		\n\t"\
		"addpd	%%xmm3,%%xmm1		\n\t	addpd	%%xmm11,%%xmm9 		\n\t"\
		"movaps	    (%%rdx),%%xmm6	\n\t"\
		"movaps	0x10(%%rdx),%%xmm7	\n\t"\
		"movaps	%%xmm0,     (%%rax)	\n\t	movaps	%%xmm8 ,     (%%r10)\n\t"\
		"movaps	%%xmm1,0x010(%%rax)	\n\t	movaps	%%xmm9 ,0x010(%%r10)\n\t"\
		"mulpd	%%xmm6,%%xmm2		\n\t	mulpd	%%xmm6 ,%%xmm10		\n\t"\
		"mulpd	%%xmm6,%%xmm3		\n\t	mulpd	%%xmm6 ,%%xmm11		\n\t"\
		"mulpd	%%xmm7,%%xmm4		\n\t	mulpd	%%xmm7 ,%%xmm12		\n\t"\
		"mulpd	%%xmm7,%%xmm5		\n\t	mulpd	%%xmm7 ,%%xmm13		\n\t"\
		"addpd	%%xmm0,%%xmm2		\n\t	addpd	%%xmm8 ,%%xmm10		\n\t"\
		"addpd	%%xmm1,%%xmm3		\n\t	addpd	%%xmm9 ,%%xmm11		\n\t"\
		"movaps	%%xmm2,%%xmm0		\n\t	movaps	%%xmm10,%%xmm8 		\n\t"\
		"movaps	%%xmm3,%%xmm1		\n\t	movaps	%%xmm11,%%xmm9 		\n\t"\
		"subpd	%%xmm5,%%xmm2		\n\t	subpd	%%xmm13,%%xmm10		\n\t"\
		"addpd	%%xmm4,%%xmm3		\n\t	addpd	%%xmm12,%%xmm11		\n\t"\
		"addpd	%%xmm5,%%xmm0		\n\t	addpd	%%xmm13,%%xmm8 		\n\t"\
		"subpd	%%xmm4,%%xmm1		\n\t	subpd	%%xmm12,%%xmm9 		\n\t"\
		"movaps	%%xmm2,     (%%rbx)	\n\t	movaps	%%xmm10,     (%%r11)\n\t"\
		"movaps	%%xmm3,0x010(%%rbx)	\n\t	movaps	%%xmm11,0x010(%%r11)\n\t"\
		"movaps	%%xmm0,     (%%rcx)	\n\t	movaps	%%xmm8 ,     (%%r12)\n\t"\
		"movaps	%%xmm1,0x010(%%rcx)	\n\t	movaps	%%xmm9 ,0x010(%%r12)\n\t"\
		:					/* outputs: none */\
		: [__cc0] "m" (Xcc0)	/* All inputs from memory addresses here */\
		 ,[__i0] "m" (Xi0)\
		 ,[__i1] "m" (Xi1)\
		 ,[__i2] "m" (Xi2)\
		 ,[__o0] "m" (Xo0)\
		 ,[__o1] "m" (Xo1)\
		 ,[__o2] "m" (Xo2)\
		 ,[__j0] "m" (Xj0)\
		 ,[__j1] "m" (Xj1)\
		 ,[__j2] "m" (Xj2)\
		 ,[__u0] "m" (Xu0)\
		 ,[__u1] "m" (Xu1)\
		 ,[__u2] "m" (Xu2)\
		: "cc","memory","rax","rbx","rcx","rdx","r10","r11","r12","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15"		/* Clobbered registers */\
	);\
	}

	#define SSE2_RADIX4_DIF_0TWIDDLE_STRIDE(Xadd0, Xadd1, Xadd2, Xadd3, Xtmp, Xstride)\
	{\
	__asm__ volatile (\
		"movq	%[__tmp]   ,%%rax	\n\t"\
		"movq	%[__stride],%%rsi	\n\t"\
		"movq	%%rax,%%rbx			\n\t"\
		"addq	%%rsi,%%rbx			/* add_in1  */\n\t"\
		"shlq	$1,%%rsi			/* stride*2 */\n\t"\
		"movaps	    (%%rax),%%xmm0	\n\t"\
		"movaps	    (%%rbx),%%xmm2	\n\t"\
		"movaps	0x10(%%rax),%%xmm1	\n\t"\
		"movaps	0x10(%%rbx),%%xmm3	\n\t"\
		"movaps	    (%%rax),%%xmm4	\n\t"\
		"movaps	    (%%rbx),%%xmm6	\n\t"\
		"movaps	0x10(%%rax),%%xmm5	\n\t"\
		"movaps	0x10(%%rbx),%%xmm7	\n\t"\
		"addq	%%rsi,%%rax			/* add_in2  */\n\t"\
		"addq	%%rsi,%%rbx			/* add_in3  */\n\t"\
		"addpd	    (%%rax),%%xmm0	\n\t"\
		"addpd	    (%%rbx),%%xmm2	\n\t"\
		"addpd	0x10(%%rax),%%xmm1	\n\t"\
		"addpd	0x10(%%rbx),%%xmm3	\n\t"\
		"subpd	    (%%rax),%%xmm4	\n\t"\
		"subpd	    (%%rbx),%%xmm6	\n\t"\
		"subpd	0x10(%%rax),%%xmm5	\n\t"\
		"subpd	0x10(%%rbx),%%xmm7	\n\t"\
		"/* Finish radix-4 butterfly and store results into main-array slots: */\n\t"\
		"movq	%[__add0],%%rax		\n\t"\
		"movq	%[__add1],%%rbx		\n\t"\
		"movq	%[__add2],%%rcx		\n\t"\
		"movq	%[__add3],%%rdx		\n\t"\
		"subpd	%%xmm2,%%xmm0		\n\t"\
		"subpd	%%xmm7,%%xmm4		\n\t"\
		"subpd	%%xmm3,%%xmm1		\n\t"\
		"subpd	%%xmm6,%%xmm5		\n\t"\
		"movaps	%%xmm0,     (%%rbx)	\n\t"\
		"movaps	%%xmm4,     (%%rcx)	\n\t"\
		"movaps	%%xmm1,0x010(%%rbx)	\n\t"\
		"movaps	%%xmm5,0x010(%%rdx)	\n\t"\
		"addpd	%%xmm2,%%xmm2		\n\t"\
		"addpd	%%xmm7,%%xmm7		\n\t"\
		"addpd	%%xmm3,%%xmm3		\n\t"\
		"addpd	%%xmm6,%%xmm6		\n\t"\
		"addpd	%%xmm0,%%xmm2		\n\t"\
		"addpd	%%xmm4,%%xmm7		\n\t"\
		"addpd	%%xmm1,%%xmm3		\n\t"\
		"addpd	%%xmm5,%%xmm6		\n\t"\
		"movaps	%%xmm2,     (%%rax)	\n\t"\
		"movaps	%%xmm7,     (%%rdx)	\n\t"\
		"movaps	%%xmm3,0x010(%%rax)	\n\t"\
		"movaps	%%xmm6,0x010(%%rcx)	\n\t"\
		:					/* outputs: none */\
		: [__add0] "m" (Xadd0)	/* All inputs from memory addresses here */\
		 ,[__add1] "m" (Xadd1)\
		 ,[__add2] "m" (Xadd2)\
		 ,[__add3] "m" (Xadd3)\
		 ,[__tmp] "m" (Xtmp)\
		 ,[__stride] "e" (Xstride)\
		: "cc","memory","rax","rbx","rcx","rdx","rsi","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7"		/* Clobbered registers */\
	);\
	}

	/* DIF radix-4 subconvolution, sans twiddles, inputs in __i0-3, outputs in __o0-3, possibly coincident with inputs: */
	#define SSE2_RADIX4_DIF_0TWIDDLE_STRIDE_E(Xi0,Xi1,Xi2,Xi3, Xo0,Xo1,Xo2,Xo3)\
	{\
	__asm__ volatile (\
		"movq	%[__i0],%%rax		\n\t"\
		"movq	%[__i1],%%rbx		\n\t"\
		"movq	%[__i2],%%rcx		\n\t"\
		"movq	%[__i3],%%rdx		\n\t"\
		"movaps	    (%%rax),%%xmm0	\n\t"\
		"movaps	    (%%rbx),%%xmm4	\n\t"\
		"movaps	0x10(%%rax),%%xmm1	\n\t"\
		"movaps	0x10(%%rbx),%%xmm5	\n\t"\
		"movaps	%%xmm0,%%xmm2	\n\t"\
		"movaps	%%xmm4,%%xmm6	\n\t"\
		"movaps	%%xmm1,%%xmm3	\n\t"\
		"movaps	%%xmm5,%%xmm7	\n\t"\
		"addpd	    (%%rcx),%%xmm0	\n\t"\
		"addpd	    (%%rdx),%%xmm4	\n\t"\
		"addpd	0x10(%%rcx),%%xmm1	\n\t"\
		"addpd	0x10(%%rdx),%%xmm5	\n\t"\
		"subpd	    (%%rcx),%%xmm2	\n\t"\
		"subpd	    (%%rdx),%%xmm6	\n\t"\
		"subpd	0x10(%%rcx),%%xmm3	\n\t"\
		"subpd	0x10(%%rdx),%%xmm7	\n\t"\
		"/* Finish radix-4 butterfly and store results into main-array slots: */\n\t"\
		"movq	%[__o0],%%rax		\n\t"\
		"movq	%[__o1],%%rbx		\n\t"\
		"movq	%[__o2],%%rcx		\n\t"\
		"movq	%[__o3],%%rdx		\n\t"\
		"subpd	%%xmm4,%%xmm0		\n\t"\
		"subpd	%%xmm7,%%xmm2		\n\t"\
		"subpd	%%xmm5,%%xmm1		\n\t"\
		"subpd	%%xmm6,%%xmm3		\n\t"\
		"movaps	%%xmm0,    (%%rbx)	\n\t"\
		"movaps	%%xmm2,    (%%rcx)	\n\t"\
		"movaps	%%xmm1,0x10(%%rbx)	\n\t"\
		"movaps	%%xmm3,0x10(%%rdx)	\n\t"\
		"addpd	%%xmm4,%%xmm4		\n\t"\
		"addpd	%%xmm7,%%xmm7		\n\t"\
		"addpd	%%xmm5,%%xmm5		\n\t"\
		"addpd	%%xmm6,%%xmm6		\n\t"\
		"addpd	%%xmm0,%%xmm4		\n\t"\
		"addpd	%%xmm2,%%xmm7		\n\t"\
		"addpd	%%xmm1,%%xmm5		\n\t"\
		"addpd	%%xmm3,%%xmm6		\n\t"\
		"movaps	%%xmm4,    (%%rax)	\n\t"\
		"movaps	%%xmm7,    (%%rdx)	\n\t"\
		"movaps	%%xmm5,0x10(%%rax)	\n\t"\
		"movaps	%%xmm6,0x10(%%rcx)	\n\t"\
		:					/* outputs: none */\
		: [__i0] "m" (Xi0)	/* All inputs from memory addresses here */\
		 ,[__i1] "m" (Xi1)\
		 ,[__i2] "m" (Xi2)\
		 ,[__i3] "m" (Xi3)\
		 ,[__o0] "m" (Xo0)\
		 ,[__o1] "m" (Xo1)\
		 ,[__o2] "m" (Xo2)\
		 ,[__o3] "m" (Xo3)\
		: "cc","memory","rax","rbx","rcx","rdx","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7"		/* Clobbered registers */\
	);\
	}

	/*
	SSE2_RADIX4_DIT_0TWIDDLE_STRIDE: no-twiddle radix-4 DIT butterfly.
	Inputs are the 4 main-array addresses Xadd0-3; each points at a 32-byte complex
	pair (two Re parts at +0, two Im parts at +0x10, 16-byte aligned for movaps).
	Results are written to the local store at Xtmp with complex-pair byte stride
	Xstride (an "e"-constraint compile-time constant): out0..out3 = tmp + {0,s,2s,3s}.
	Per SIMD lane, with t0 = A0+A1, t1 = A0-A1, t2 = A2+A3, t3 = A2-A3:
	  B0 = t0+t2, B2 = t0-t2, B1 = (t1r + t3i, t1i - t3r), B3 = (t1r - t3i, t1i + t3r),
	i.e. the DIT-direction +/- i*t3 twist on the odd-index outputs.
	Fix: removed stale "rsi" from the clobber list -- this macro never touches rsi
	(the stride is materialized in rcx); the entry was inherited from the DIF variant
	and needlessly blocked rsi from the compiler's surrounding register allocation.
	*/
	#define SSE2_RADIX4_DIT_0TWIDDLE_STRIDE(Xadd0, Xadd1, Xadd2, Xadd3, Xtmp, Xstride)\
	{\
	__asm__ volatile (\
		"movq	%[__add0],%%rax		\n\t"\
		"movq	%[__add1],%%rbx		\n\t"\
		"movq	%[__add2],%%rcx		\n\t"\
		"movq	%[__add3],%%rdx		\n\t"\
	/* A0 -> xmm0,1 (copy xmm2,3); A2 -> xmm4,5 (copy xmm6,7): */\
		"movaps	    (%%rax),%%xmm0	\n\t"\
		"movaps	    (%%rcx),%%xmm4	\n\t"\
		"movaps	0x10(%%rax),%%xmm1	\n\t"\
		"movaps	0x10(%%rcx),%%xmm5	\n\t"\
		"movaps	%%xmm0,%%xmm2			\n\t"\
		"movaps	%%xmm4,%%xmm6			\n\t"\
		"movaps	%%xmm1,%%xmm3			\n\t"\
		"movaps	%%xmm5,%%xmm7			\n\t"\
		"movq	%[__tmp]   ,%%rax	\n\t"\
		"movq	%[__stride],%%rcx	\n\t"\
	/* t0 = A0+A1 (xmm0,1), t2 = A2+A3 (xmm4,5), t1 = A0-A1 (xmm2,3), t3 = A2-A3 (xmm6,7): */\
		"addpd	    (%%rbx),%%xmm0	\n\t"\
		"addpd	    (%%rdx),%%xmm4	\n\t"\
		"addpd	0x10(%%rbx),%%xmm1	\n\t"\
		"addpd	0x10(%%rdx),%%xmm5	\n\t"\
		"subpd	    (%%rbx),%%xmm2	\n\t"\
		"subpd	    (%%rdx),%%xmm6	\n\t"\
		"subpd	0x10(%%rbx),%%xmm3	\n\t"\
		"subpd	0x10(%%rdx),%%xmm7	\n\t"\
	/* Output addresses: rax = tmp, rbx = tmp+s, rcx = tmp+2s, rdx = tmp+3s: */\
		"movq	%%rax,%%rbx			\n\t"\
		"addq	%%rcx,%%rbx			\n\t"\
		"movq	%%rbx,%%rdx			\n\t"\
		"addq	%%rcx,%%rcx			\n\t"\
		"addq	%%rcx,%%rdx			\n\t"\
		"addq	%%rax,%%rcx			\n\t"\
		"/* Finish radix-4 butterfly and store results into temp-array slots: */\n\t"\
	/* Differences: B2 -> tmp+2s; minus-halves of B3r (-> tmp+3s) and B1i (-> tmp+s): */\
		"subpd	%%xmm4,%%xmm0			\n\t"\
		"subpd	%%xmm7,%%xmm2			\n\t"\
		"subpd	%%xmm5,%%xmm1			\n\t"\
		"subpd	%%xmm6,%%xmm3			\n\t"\
		"movaps	%%xmm0,     (%%rcx)	\n\t"\
		"movaps	%%xmm2,     (%%rdx)	\n\t"\
		"movaps	%%xmm1,0x010(%%rcx)	\n\t"\
		"movaps	%%xmm3,0x010(%%rbx)	\n\t"\
	/* Double-and-add recovers the sums: B0 -> tmp, B1r -> tmp+s, B3i -> tmp+3s: */\
		"addpd	%%xmm4,%%xmm4			\n\t"\
		"addpd	%%xmm7,%%xmm7			\n\t"\
		"addpd	%%xmm5,%%xmm5			\n\t"\
		"addpd	%%xmm6,%%xmm6			\n\t"\
		"addpd	%%xmm0,%%xmm4			\n\t"\
		"addpd	%%xmm2,%%xmm7			\n\t"\
		"addpd	%%xmm1,%%xmm5			\n\t"\
		"addpd	%%xmm3,%%xmm6			\n\t"\
		"movaps	%%xmm4,     (%%rax)	\n\t"\
		"movaps	%%xmm7,     (%%rbx)	\n\t"\
		"movaps	%%xmm5,0x010(%%rax)	\n\t"\
		"movaps	%%xmm6,0x010(%%rdx)	\n\t"\
		:					/* outputs: none */\
		: [__add0] "m" (Xadd0)	/* All inputs from memory addresses here */\
		 ,[__add1] "m" (Xadd1)\
		 ,[__add2] "m" (Xadd2)\
		 ,[__add3] "m" (Xadd3)\
		 ,[__tmp] "m" (Xtmp)\
		 ,[__stride] "e" (Xstride)\
		: "cc","memory","rax","rbx","rcx","rdx","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7"		/* Clobbered registers */\
	);\
	}

	/* DIT radix-4 subconvolution, sans twiddles, inputs in __i0-3, outputs in __o0-3, possibly coincident with inputs: */
	#define SSE2_RADIX4_DIT_0TWIDDLE_STRIDE_E(Xi0,Xi1,Xi2,Xi3, Xo0,Xo1,Xo2,Xo3)\
	{\
	__asm__ volatile (\
		"movq	%[__i0],%%rax		\n\t"\
		"movq	%[__i1],%%rbx		\n\t"\
		"movq	%[__i2],%%rcx		\n\t"\
		"movq	%[__i3],%%rdx		\n\t"\
		"movaps	    (%%rax),%%xmm0	\n\t"\
		"movaps	    (%%rcx),%%xmm4	\n\t"\
		"movaps	0x10(%%rax),%%xmm1	\n\t"\
		"movaps	0x10(%%rcx),%%xmm5	\n\t"\
		"movaps	%%xmm0,%%xmm2			\n\t"\
		"movaps	%%xmm4,%%xmm6			\n\t"\
		"movaps	%%xmm1,%%xmm3			\n\t"\
		"movaps	%%xmm5,%%xmm7			\n\t"\
		"addpd	    (%%rbx),%%xmm0	\n\t"\
		"addpd	    (%%rdx),%%xmm4	\n\t"\
		"addpd	0x10(%%rbx),%%xmm1	\n\t"\
		"addpd	0x10(%%rdx),%%xmm5	\n\t"\
		"subpd	    (%%rbx),%%xmm2	\n\t"\
		"subpd	    (%%rdx),%%xmm6	\n\t"\
		"subpd	0x10(%%rbx),%%xmm3	\n\t"\
		"subpd	0x10(%%rdx),%%xmm7	\n\t"\
		"/* Finish radix-4 butterfly and store results into output-array slots: */\n\t"\
		"movq	%[__o0],%%rax		\n\t"\
		"movq	%[__o1],%%rbx		\n\t"\
		"movq	%[__o2],%%rcx		\n\t"\
		"movq	%[__o3],%%rdx		\n\t"\
		"subpd	%%xmm4,%%xmm0			\n\t"\
		"subpd	%%xmm7,%%xmm2			\n\t"\
		"subpd	%%xmm5,%%xmm1			\n\t"\
		"subpd	%%xmm6,%%xmm3			\n\t"\
		"movaps	%%xmm0,     (%%rcx)	\n\t"\
		"movaps	%%xmm2,     (%%rdx)	\n\t"\
		"movaps	%%xmm1,0x010(%%rcx)	\n\t"\
		"movaps	%%xmm3,0x010(%%rbx)	\n\t"\
		"addpd	%%xmm4,%%xmm4			\n\t"\
		"addpd	%%xmm7,%%xmm7			\n\t"\
		"addpd	%%xmm5,%%xmm5			\n\t"\
		"addpd	%%xmm6,%%xmm6			\n\t"\
		"addpd	%%xmm0,%%xmm4			\n\t"\
		"addpd	%%xmm2,%%xmm7			\n\t"\
		"addpd	%%xmm1,%%xmm5			\n\t"\
		"addpd	%%xmm3,%%xmm6			\n\t"\
		"movaps	%%xmm4,     (%%rax)	\n\t"\
		"movaps	%%xmm7,     (%%rbx)	\n\t"\
		"movaps	%%xmm5,0x010(%%rax)	\n\t"\
		"movaps	%%xmm6,0x010(%%rdx)	\n\t"\
		:					/* outputs: none */\
		: [__i0] "m" (Xi0)	/* All inputs from memory addresses here */\
		 ,[__i1] "m" (Xi1)\
		 ,[__i2] "m" (Xi2)\
		 ,[__i3] "m" (Xi3)\
		 ,[__o0] "m" (Xo0)\
		 ,[__o1] "m" (Xo1)\
		 ,[__o2] "m" (Xo2)\
		 ,[__o3] "m" (Xo3)\
		: "cc","memory","rax","rbx","rcx","rdx","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7"		/* Clobbered registers */\
	);\
	}

	// May 2016: Used new macro-timing-loop code in util.c to get this down to ~60 cycles for two complex 4-DFTs.
	// X[i|odiff] contain the bytewise address offset between the pointers to the first and second DFT's data.
	// Two versions of the resulting macro: _X1 has both DFTs (instructions columns) sharing same twiddles
	// (thus no 'roff' twiddle-sets-byte-offset argument), _X2 has separate twiddles for each:
	#define SSE2_RADIX_04_DIF_3TWIDDLE_X1(Xin0,Xin1,Xin2,Xin3,Xidiff, Xtwo,Xcc0, Xout0,Xout1,Xout2,Xout3,Xodiff)\
	{\
	__asm__ volatile (\
		/* SIMD opcount: 67 MEM (= MOVAPS from/to memory), 44 ADDPD, 32 MULPD */\
		"movq	%[__cc0],%%rsi 		\n\t	movslq	%[__idiff],%%rdi	\n\t"\
		"movq	%[__in0],%%rax		\n\t"\
		"movq	%[__in1],%%rbx		\n\t	movslq	%[__odiff],%%r9		\n\t"\
		"movq	%[__in2],%%rcx		\n\t	movq	%[__out0],%%r10		\n\t"\
		"movq	%[__in3],%%rdx		\n\t	movq	%[__out1],%%r11		\n\t"\
		"movaps	    (%%rax),%%xmm0	\n\t	movq	%[__out2],%%r12		\n\t"\
		"movaps	0x10(%%rax),%%xmm1	\n\t	movq	%[__out3],%%r13		\n\t"\
		/* Do	the p0,p2 combo: */\
		"movaps	    (%%rcx),%%xmm4	\n\t"\
		"movaps	0x10(%%rcx),%%xmm5	\n\t"\
		"movaps	    (%%rsi),%%xmm10	\n\t"\
		"movaps	0x10(%%rsi),%%xmm11	\n\t"\
		"movaps	%%xmm4,%%xmm6		\n\t	movaps	    (%%rcx,%%rdi),%%xmm12	\n\t"\
		"movaps	%%xmm5,%%xmm7		\n\t	movaps	0x10(%%rcx,%%rdi),%%xmm13	\n\t"\
		"movaps	    (%%rax),%%xmm0	\n\t"\
		"movaps	0x10(%%rax),%%xmm1	\n\t"\
		"mulpd	%%xmm10,%%xmm4		\n\t	movaps	%%xmm12,%%xmm14		\n\t"\
		"mulpd	%%xmm10,%%xmm5		\n\t	movaps	%%xmm13,%%xmm15		\n\t"\
		"mulpd	%%xmm11,%%xmm6		\n\t	movaps	    (%%rax,%%rdi),%%xmm8 	\n\t	movaps	%%xmm0,%%xmm2		\n\t"\
		"mulpd	%%xmm11,%%xmm7		\n\t	movaps	0x10(%%rax,%%rdi),%%xmm9 	\n\t	movaps	%%xmm1,%%xmm3		\n\t"\
		"addpd	%%xmm6,%%xmm5		\n\t	mulpd	%%xmm10,%%xmm12		\n\t"\
		"subpd	%%xmm7,%%xmm4		\n\t	mulpd	%%xmm10,%%xmm13		\n\t"\
		"addpd	%%xmm4,%%xmm0		\n\t	mulpd	%%xmm11,%%xmm14		\n\t	movaps	%%xmm8 ,%%xmm10		\n\t"\
		"addpd	%%xmm5,%%xmm1		\n\t	mulpd	%%xmm11,%%xmm15		\n\t	movaps	%%xmm9 ,%%xmm11		\n\t"\
		"subpd	%%xmm4,%%xmm2		\n\t	addpd	%%xmm14,%%xmm13		\n\t"\
		"subpd	%%xmm5,%%xmm3		\n\t	subpd	%%xmm15,%%xmm12		\n\t"\
		"movaps	%%xmm0,    (%%r10)	\n\t	addpd	%%xmm12,%%xmm8 		\n\t"/* Spill 1: free up xmm0,1 */\
		"movaps	%%xmm1,0x10(%%r10)	\n\t	addpd	%%xmm13,%%xmm9 		\n\t"/* Do	the p1,3 combo: */\
		"movaps	0x40(%%rsi),%%xmm0	\n\t	subpd	%%xmm12,%%xmm10		\n\t"\
		"movaps	0x50(%%rsi),%%xmm1	\n\t	subpd	%%xmm13,%%xmm11		\n\t"\
		"movaps	    (%%rdx),%%xmm6	\n\t	movaps	%%xmm8 ,    (%%r10,%%r9)	\n\t"\
		"movaps	0x10(%%rdx),%%xmm7	\n\t	movaps	%%xmm9 ,0x10(%%r10,%%r9)	\n\t"\
		"movaps	%%xmm6,%%xmm4		\n\t"\
		"movaps	%%xmm7,%%xmm5		\n\t"\
		"mulpd	%%xmm0,%%xmm4		\n\t	movaps	    (%%rdx,%%rdi),%%xmm14	\n\t"\
		"mulpd	%%xmm0,%%xmm5		\n\t	movaps	0x10(%%rdx,%%rdi),%%xmm15	\n\t"\
		"mulpd	%%xmm1,%%xmm6		\n\t	movaps	%%xmm14,%%xmm12		\n\t"\
		"mulpd	%%xmm1,%%xmm7		\n\t	movaps	%%xmm15,%%xmm13		\n\t"\
		"addpd	%%xmm6,%%xmm5		\n\t	mulpd	%%xmm0 ,%%xmm12		\n\t"\
		"subpd	%%xmm7,%%xmm4		\n\t	mulpd	%%xmm0 ,%%xmm13		\n\t"\
		"movaps	%%xmm5,0x10(%%r12)	\n\t	mulpd	%%xmm1 ,%%xmm14		\n\t"/* Spill 2 */\
		"movaps	%%xmm4,    (%%r12)	\n\t	mulpd	%%xmm1 ,%%xmm15		\n\t"\
		"movaps	0x20(%%rsi),%%xmm8	\n\t	addpd	%%xmm14,%%xmm13		\n\t"\
		"movaps	0x30(%%rsi),%%xmm9	\n\t	subpd	%%xmm15,%%xmm12		\n\t"\
		"movaps	    (%%rbx),%%xmm6	\n\t	movaps	%%xmm13,0x10(%%r12,%%r9)	\n\t"\
		"movaps	0x10(%%rbx),%%xmm7	\n\t	movaps	%%xmm12,    (%%r12,%%r9)	\n\t"\
		"movaps	%%xmm6,%%xmm4		\n\t"\
		"movaps	%%xmm7,%%xmm5		\n\t"\
		"mulpd	%%xmm8,%%xmm4		\n\t	movaps	    (%%rbx,%%rdi),%%xmm14	\n\t"\
		"mulpd	%%xmm8,%%xmm5		\n\t	movaps	0x10(%%rbx,%%rdi),%%xmm15	\n\t"\
		"mulpd	%%xmm9,%%xmm6		\n\t	movaps	%%xmm14,%%xmm12		\n\t"\
		"mulpd	%%xmm9,%%xmm7		\n\t	movaps	%%xmm15,%%xmm13		\n\t"\
		"movaps	    (%%r12),%%xmm0	\n\t	mulpd	%%xmm8 ,%%xmm12		\n\t"/* Restore 2 */\
		"movaps	0x10(%%r12),%%xmm1	\n\t	mulpd	%%xmm8 ,%%xmm13		\n\t"\
		"addpd	%%xmm6,%%xmm5		\n\t	mulpd	%%xmm9 ,%%xmm14		\n\t	 movq	%[__two],%%rsi	\n\t"\
		"subpd	%%xmm7,%%xmm4		\n\t	mulpd	%%xmm9 ,%%xmm15		\n\t"\
		"movaps	%%xmm5,%%xmm7		\n\t	movaps	    (%%r12,%%r9),%%xmm8 	\n\t"\
		"movaps	%%xmm4,%%xmm6		\n\t	movaps	0x10(%%r12,%%r9),%%xmm9 	\n\t"\
		"subpd	%%xmm0,%%xmm4		\n\t	addpd	%%xmm14,%%xmm13		\n\t"\
		"subpd	%%xmm1,%%xmm5		\n\t	subpd	%%xmm15,%%xmm12		\n\t"\
		"addpd	%%xmm0,%%xmm6		\n\t	movaps	%%xmm13,%%xmm15		\n\t"\
		"addpd	%%xmm1,%%xmm7		\n\t	movaps	%%xmm12,%%xmm14		\n\t"\
		/* Finish radix-4 butterfly and store results: */\
		"movaps	    (%%r10),%%xmm0	\n\t	subpd	%%xmm8 ,%%xmm12		\n\t"/* Restore 1 */\
		"movaps	0x10(%%r10),%%xmm1	\n\t	subpd	%%xmm9 ,%%xmm13		\n\t"\
		"subpd	%%xmm6,%%xmm0		\n\t	addpd	%%xmm8 ,%%xmm14		\n\t"\
		"subpd	%%xmm5,%%xmm2		\n\t	addpd	%%xmm9 ,%%xmm15		\n\t"\
		"subpd	%%xmm7,%%xmm1		\n\t	movaps	    (%%r10,%%r9),%%xmm8 	\n\t"\
		"subpd	%%xmm4,%%xmm3		\n\t	movaps	0x10(%%r10,%%r9),%%xmm9 	\n\t"\
		"movaps	%%xmm0,    (%%r12)	\n\t	subpd	%%xmm14,%%xmm8 		\n\t"	/* 2.0, shared by both columns ... moved +- until found best cycle count: */\
		"movaps	%%xmm2,    (%%r11)	\n\t	subpd	%%xmm13,%%xmm10		\n\t	movaps	(%%rsi),%%xmm0	\n\t"\
		"movaps	%%xmm1,0x10(%%r12)	\n\t	subpd	%%xmm15,%%xmm9 		\n\t"\
		"movaps	%%xmm3,0x10(%%r13)	\n\t	subpd	%%xmm12,%%xmm11		\n\t"\
		"mulpd	%%xmm0,%%xmm6		\n\t	movaps	%%xmm8 ,    (%%r12,%%r9)	\n\t"\
		"mulpd	%%xmm0,%%xmm5		\n\t	movaps	%%xmm10,    (%%r11,%%r9)	\n\t"\
		"mulpd	%%xmm0,%%xmm7		\n\t	movaps	%%xmm9 ,0x10(%%r12,%%r9)	\n\t"\
		"mulpd	%%xmm0,%%xmm4		\n\t	movaps	%%xmm11,0x10(%%r13,%%r9)	\n\t"\
		"addpd	    (%%r12),%%xmm6	\n\t	mulpd	%%xmm0 ,%%xmm14		\n\t"\
		"addpd		%%xmm2 ,%%xmm5	\n\t	mulpd	%%xmm0 ,%%xmm13		\n\t"\
		"addpd		%%xmm1 ,%%xmm7	\n\t	mulpd	%%xmm0 ,%%xmm15		\n\t"\
		"addpd		%%xmm3 ,%%xmm4	\n\t	mulpd	%%xmm0 ,%%xmm12		\n\t"\
		"movaps	%%xmm6,    (%%r10)	\n\t	addpd	    (%%r12,%%r9),%%xmm14	\n\t"/* don't need reload-from-mem of xmm8/0xc0(%%r12,%%r9) as we do in lcol, but 1 cycle faster with it. [!?] */\
		"movaps	%%xmm5,    (%%r13)	\n\t	addpd		%%xmm10,%%xmm13	\n\t"\
		"movaps	%%xmm7,0x10(%%r10)	\n\t	addpd		%%xmm9 ,%%xmm15	\n\t"\
		"movaps	%%xmm4,0x10(%%r11)	\n\t	addpd		%%xmm11,%%xmm12	\n\t"\
		"									movaps	%%xmm14,    (%%r10,%%r9)	\n\t"\
		"									movaps	%%xmm13,    (%%r13,%%r9)	\n\t"\
		"									movaps	%%xmm15,0x10(%%r10,%%r9)	\n\t"\
		"									movaps	%%xmm12,0x10(%%r11,%%r9)	\n\t"\
		:					/* outputs: none */\
		: [__in0] "m" (Xin0)	/* All inputs from memory addresses here */\
		 ,[__in1] "m" (Xin1)\
		 ,[__in2] "m" (Xin2)\
		 ,[__in3] "m" (Xin3)\
		/* idiff, 'input-address difference', has variable-stored bytewise address offset between ptrs to 1st,2nd DFT's inputs */\
		 ,[__idiff] "m" (Xidiff)\
		 ,[__two] "m" (Xtwo)	/* pointer to vector-const 2.0 */\
		 ,[__cc0] "m" (Xcc0)\
		 ,[__out0] "m" (Xout0)\
		 ,[__out1] "m" (Xout1)\
		 ,[__out2] "m" (Xout2)\
		 ,[__out3] "m" (Xout3)\
		/* odiff, 'output-address difference', has variable-stored bytewise address offset between ptrs to 1st,2nd DFT's outputs */\
		 ,[__odiff] "m" (Xodiff)\
		: "cc","memory","rax","rbx","rcx","rdx","rdi","rsi","r9","r10","r11","r12","r13","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15"	/* Clobbered registers */\
	);\
	}

	#define SSE2_RADIX_04_DIF_3TWIDDLE_X2(Xin0,Xin1,Xin2,Xin3,Xidiff, Xtwo,Xcc0,Xroff, Xout0,Xout1,Xout2,Xout3,Xodiff)\
	{\
	__asm__ volatile (\
		/* SIMD opcount: 67 MEM (= MOVAPS from/to memory), 44 ADDPD, 32 MULPD */\
		"movq	%[__cc0],%%rsi 		\n\t	movslq	%[__idiff],%%rdi	\n\t"\
		"movq	%[__in0],%%rax		\n\t	leaq %c[__roff](%%rsi),%%r8	\n\t"\
		"movq	%[__in1],%%rbx		\n\t	movslq	%[__odiff],%%r9		\n\t"\
		"movq	%[__in2],%%rcx		\n\t	movq	%[__out0],%%r10		\n\t"\
		"movq	%[__in3],%%rdx		\n\t	movq	%[__out1],%%r11		\n\t"\
		"movaps	    (%%rax),%%xmm0	\n\t	movq	%[__out2],%%r12		\n\t"\
		"movaps	0x10(%%rax),%%xmm1	\n\t	movq	%[__out3],%%r13		\n\t"\
		/* Do	the p0,p2 combo: */\
		"movaps	    (%%rcx),%%xmm4	\n\t"\
		"movaps	0x10(%%rcx),%%xmm5	\n\t"\
		"movaps	    (%%rsi),%%xmm2	\n\t"\
		"movaps	0x10(%%rsi),%%xmm3	\n\t"\
		"movaps	%%xmm4,%%xmm6		\n\t	movaps	    (%%rcx,%%rdi),%%xmm12	\n\t"\
		"movaps	%%xmm5,%%xmm7		\n\t	movaps	0x10(%%rcx,%%rdi),%%xmm13	\n\t"\
		"movaps	    (%%rax),%%xmm0	\n\t	movaps	    (%%r8),%%xmm10	\n\t"\
		"movaps	0x10(%%rax),%%xmm1	\n\t	movaps	0x10(%%r8),%%xmm11	\n\t"\
		"mulpd	%%xmm2,%%xmm4		\n\t	movaps	%%xmm12,%%xmm14		\n\t"\
		"mulpd	%%xmm2,%%xmm5		\n\t	movaps	%%xmm13,%%xmm15		\n\t"\
		"mulpd	%%xmm3,%%xmm6		\n\t	movaps	    (%%rax,%%rdi),%%xmm8 	\n\t	movaps	%%xmm0,%%xmm2		\n\t"\
		"mulpd	%%xmm3,%%xmm7		\n\t	movaps	0x10(%%rax,%%rdi),%%xmm9 	\n\t	movaps	%%xmm1,%%xmm3		\n\t"\
		"addpd	%%xmm6,%%xmm5		\n\t	mulpd	%%xmm10,%%xmm12		\n\t"\
		"subpd	%%xmm7,%%xmm4		\n\t	mulpd	%%xmm10,%%xmm13		\n\t"\
		"addpd	%%xmm4,%%xmm0		\n\t	mulpd	%%xmm11,%%xmm14		\n\t	movaps	%%xmm8 ,%%xmm10		\n\t"\
		"addpd	%%xmm5,%%xmm1		\n\t	mulpd	%%xmm11,%%xmm15		\n\t	movaps	%%xmm9 ,%%xmm11		\n\t"\
		"subpd	%%xmm4,%%xmm2		\n\t	addpd	%%xmm14,%%xmm13		\n\t"\
		"subpd	%%xmm5,%%xmm3		\n\t	subpd	%%xmm15,%%xmm12		\n\t"\
		"movaps	%%xmm0,    (%%r10)	\n\t	addpd	%%xmm12,%%xmm8 		\n\t"/* Spill 1: free up xmm0,1 */\
		"movaps	%%xmm1,0x10(%%r10)	\n\t	addpd	%%xmm13,%%xmm9 		\n\t"/* Do	the p1,3 combo: */\
		"movaps	0x40(%%rsi),%%xmm0	\n\t	subpd	%%xmm12,%%xmm10		\n\t"\
		"movaps	0x50(%%rsi),%%xmm1	\n\t	subpd	%%xmm13,%%xmm11		\n\t"\
		"movaps	    (%%rdx),%%xmm6	\n\t	movaps	%%xmm8 ,    (%%r10,%%r9)	\n\t"\
		"movaps	0x10(%%rdx),%%xmm7	\n\t	movaps	%%xmm9 ,0x10(%%r10,%%r9)	\n\t"\
		"movaps	%%xmm6,%%xmm4		\n\t	movaps	0x40(%%r8),%%xmm8 	\n\t"\
		"movaps	%%xmm7,%%xmm5		\n\t	movaps	0x50(%%r8),%%xmm9 	\n\t"\
		"mulpd	%%xmm0,%%xmm4		\n\t	movaps	    (%%rdx,%%rdi),%%xmm14	\n\t"\
		"mulpd	%%xmm0,%%xmm5		\n\t	movaps	0x10(%%rdx,%%rdi),%%xmm15	\n\t"\
		"mulpd	%%xmm1,%%xmm6		\n\t	movaps	%%xmm14,%%xmm12		\n\t"\
		"mulpd	%%xmm1,%%xmm7		\n\t	movaps	%%xmm15,%%xmm13		\n\t"\
		"addpd	%%xmm6,%%xmm5		\n\t	mulpd	%%xmm8 ,%%xmm12		\n\t"\
		"subpd	%%xmm7,%%xmm4		\n\t	mulpd	%%xmm8 ,%%xmm13		\n\t"\
		"movaps	%%xmm5,0x10(%%r12)	\n\t	mulpd	%%xmm9 ,%%xmm14		\n\t"/* Spill 2 */\
		"movaps	%%xmm4,    (%%r12)	\n\t	mulpd	%%xmm9 ,%%xmm15		\n\t"\
		"movaps	0x20(%%rsi),%%xmm0	\n\t	addpd	%%xmm14,%%xmm13		\n\t"\
		"movaps	0x30(%%rsi),%%xmm1	\n\t	subpd	%%xmm15,%%xmm12		\n\t"\
		"movaps	    (%%rbx),%%xmm6	\n\t	movaps	%%xmm13,0x10(%%r12,%%r9)	\n\t"\
		"movaps	0x10(%%rbx),%%xmm7	\n\t	movaps	%%xmm12,    (%%r12,%%r9)	\n\t"\
		"movaps	%%xmm6,%%xmm4		\n\t	movaps	0x20(%%r8),%%xmm8 	\n\t"\
		"movaps	%%xmm7,%%xmm5		\n\t	movaps	0x30(%%r8),%%xmm9 	\n\t"\
		"mulpd	%%xmm0,%%xmm4		\n\t	movaps	    (%%rbx,%%rdi),%%xmm14	\n\t"\
		"mulpd	%%xmm0,%%xmm5		\n\t	movaps	0x10(%%rbx,%%rdi),%%xmm15	\n\t"\
		"mulpd	%%xmm1,%%xmm6		\n\t	movaps	%%xmm14,%%xmm12		\n\t"\
		"mulpd	%%xmm1,%%xmm7		\n\t	movaps	%%xmm15,%%xmm13		\n\t"\
		"movaps	    (%%r12),%%xmm0	\n\t	mulpd	%%xmm8 ,%%xmm12		\n\t"/* Restore 2 */\
		"movaps	0x10(%%r12),%%xmm1	\n\t	mulpd	%%xmm8 ,%%xmm13		\n\t"\
		"addpd	%%xmm6,%%xmm5		\n\t	mulpd	%%xmm9 ,%%xmm14		\n\t	 movq	%[__two],%%rsi	\n\t"\
		"subpd	%%xmm7,%%xmm4		\n\t	mulpd	%%xmm9 ,%%xmm15		\n\t"\
		"movaps	%%xmm5,%%xmm7		\n\t	movaps	    (%%r12,%%r9),%%xmm8 	\n\t"\
		"movaps	%%xmm4,%%xmm6		\n\t	movaps	0x10(%%r12,%%r9),%%xmm9 	\n\t"\
		"subpd	%%xmm0,%%xmm4		\n\t	addpd	%%xmm14,%%xmm13		\n\t"\
		"subpd	%%xmm1,%%xmm5		\n\t	subpd	%%xmm15,%%xmm12		\n\t"\
		"addpd	%%xmm0,%%xmm6		\n\t	movaps	%%xmm13,%%xmm15		\n\t"\
		"addpd	%%xmm1,%%xmm7		\n\t	movaps	%%xmm12,%%xmm14		\n\t"\
		/* Finish radix-4 butterfly and store results: */\
		"movaps	    (%%r10),%%xmm0	\n\t	subpd	%%xmm8 ,%%xmm12		\n\t"/* Restore 1 */\
		"movaps	0x10(%%r10),%%xmm1	\n\t	subpd	%%xmm9 ,%%xmm13		\n\t"\
		"subpd	%%xmm6,%%xmm0		\n\t	addpd	%%xmm8 ,%%xmm14		\n\t"\
		"subpd	%%xmm5,%%xmm2		\n\t	addpd	%%xmm9 ,%%xmm15		\n\t"\
		"subpd	%%xmm7,%%xmm1		\n\t	movaps	    (%%r10,%%r9),%%xmm8 	\n\t"\
		"subpd	%%xmm4,%%xmm3		\n\t	movaps	0x10(%%r10,%%r9),%%xmm9 	\n\t"\
		"movaps	%%xmm0,    (%%r12)	\n\t	subpd	%%xmm14,%%xmm8 		\n\t"	/* 2.0, shared by both columns ... moved +- until found best cycle count: */\
		"movaps	%%xmm2,    (%%r11)	\n\t	subpd	%%xmm13,%%xmm10		\n\t	movaps	(%%rsi),%%xmm0	\n\t"\
		"movaps	%%xmm1,0x10(%%r12)	\n\t	subpd	%%xmm15,%%xmm9 		\n\t"\
		"movaps	%%xmm3,0x10(%%r13)	\n\t	subpd	%%xmm12,%%xmm11		\n\t"\
		"mulpd	%%xmm0,%%xmm6		\n\t	movaps	%%xmm8 ,    (%%r12,%%r9)	\n\t"\
		"mulpd	%%xmm0,%%xmm5		\n\t	movaps	%%xmm10,    (%%r11,%%r9)	\n\t"\
		"mulpd	%%xmm0,%%xmm7		\n\t	movaps	%%xmm9 ,0x10(%%r12,%%r9)	\n\t"\
		"mulpd	%%xmm0,%%xmm4		\n\t	movaps	%%xmm11,0x10(%%r13,%%r9)	\n\t"\
		"addpd	    (%%r12),%%xmm6	\n\t	mulpd	%%xmm0 ,%%xmm14		\n\t"\
		"addpd		%%xmm2 ,%%xmm5	\n\t	mulpd	%%xmm0 ,%%xmm13		\n\t"\
		"addpd		%%xmm1 ,%%xmm7	\n\t	mulpd	%%xmm0 ,%%xmm15		\n\t"\
		"addpd		%%xmm3 ,%%xmm4	\n\t	mulpd	%%xmm0 ,%%xmm12		\n\t"\
		"movaps	%%xmm6,    (%%r10)	\n\t	addpd	    (%%r12,%%r9),%%xmm14	\n\t"/* don't need reload-from-mem of xmm8/0xc0(%%r12,%%r9) as we do in lcol, but 1 cycle faster with it. [!?] */\
		"movaps	%%xmm5,    (%%r13)	\n\t	addpd		%%xmm10,%%xmm13	\n\t"\
		"movaps	%%xmm7,0x10(%%r10)	\n\t	addpd		%%xmm9 ,%%xmm15	\n\t"\
		"movaps	%%xmm4,0x10(%%r11)	\n\t	addpd		%%xmm11,%%xmm12	\n\t"\
		"									movaps	%%xmm14,    (%%r10,%%r9)	\n\t"\
		"									movaps	%%xmm13,    (%%r13,%%r9)	\n\t"\
		"									movaps	%%xmm15,0x10(%%r10,%%r9)	\n\t"\
		"									movaps	%%xmm12,0x10(%%r11,%%r9)	\n\t"\
		:					/* outputs: none */\
		: [__in0] "m" (Xin0)	/* All inputs from memory addresses here */\
		 ,[__in1] "m" (Xin1)\
		 ,[__in2] "m" (Xin2)\
		 ,[__in3] "m" (Xin3)\
		/* idiff, 'input-address difference', has variable-stored bytewise address offset between ptrs to 1st,2nd DFT's inputs */\
		 ,[__idiff] "m" (Xidiff)\
		 ,[__two] "m" (Xtwo)	/* pointer to vector-const 2.0 */\
		 ,[__cc0] "m" (Xcc0)\
		/* roff, 'roots-address offset', has literal-bytewise address offset between ptrs to 1st,2nd DFT's twiddles */\
		 ,[__roff] "e" (Xroff)\
		 ,[__out0] "m" (Xout0)\
		 ,[__out1] "m" (Xout1)\
		 ,[__out2] "m" (Xout2)\
		 ,[__out3] "m" (Xout3)\
		/* odiff, 'output-address difference', has variable-stored bytewise address offset between ptrs to 1st,2nd DFT's outputs */\
		 ,[__odiff] "m" (Xodiff)\
		: "cc","memory","rax","rbx","rcx","rdx","rdi","rsi","r8","r9","r10","r11","r12","r13","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15"	/* Clobbered registers */\
	);\
	}

	/* Following the convention in the above DIF version of this macro, 1st two roots of unity swapped:
		[c,s](1,2,3) = cc0+[(0x20,0x30),(0x00,0x10),(0x40,0x50)]
	*/
	/* NOTE(review): despite the _X1 ("two DFTs sharing twiddles") naming convention and
	the X2-style argument list, the body below computes only the single DIT DFT
	in0-3 -> out0-3: registers rdi (idiff), r8 (cc0-offset via roff) and r9 (odiff)
	are initialized in the prologue but never referenced afterward. Presumably the
	2nd instruction column was never added (or was stripped); confirm against callers
	before relying on the j/u-side arguments. The clobber list is likewise wider
	(xmm9-15) than the xmm0-8 actually used -- harmless, merely conservative. */
	#define SSE2_RADIX_04_DIT_3TWIDDLE_X1(Xin0,Xin1,Xin2,Xin3,Xidiff, Xtwo,Xcc0,Xroff, Xout0,Xout1,Xout2,Xout3,Xodiff)\
	{\
	__asm__ volatile (\
		"movq	%[__two],%%rsi 		\n\t	movslq	%[__idiff],%%rdi	\n\t"\
		"movq	%[__in0],%%rax		\n\t	leaq %c[__roff](%%rsi),%%r8	\n\t"\
		"movq	%[__in1],%%rbx		\n\t	movslq	%[__odiff],%%r9		\n\t"\
		"movq	%[__in2],%%rcx		\n\t	movq	%[__out0],%%r10		\n\t"\
		"movq	%[__in3],%%rdx		\n\t	movq	%[__out1],%%r11		\n\t"\
	/*	"movaps	    (%%rax),%%xmm0	\n\t*/"	movq	%[__out2],%%r12		\n\t"\
	/*	"movaps	0x10(%%rax),%%xmm1	\n\t*/"	movq	%[__out3],%%r13		\n\t"\
		"movaps	(%%rsi),%%xmm8	\n\t"/* two */\
		/*
		tr1 = Ar0 - Ar1;			tr0 = Ar0 + Ar1;
		ti1 = Ai0 - Ai1;			ti0 = Ai0 + Ai1;
		tr3 = Ar2 - Ar3;			tr2 = Ar2 + Ar3;
		ti3 = Ai2 - Ai3;			ti2 = Ai2 + Ai3;
		*/\
		"movaps	    (%%rax),%%xmm0	\n\t"/* Ar0 */\
		"movaps	0x10(%%rax),%%xmm1	\n\t"/* Ai0 */\
		"movaps	    (%%rbx),%%xmm2	\n\t"/* Ar1 */\
		"movaps	0x10(%%rbx),%%xmm3	\n\t"/* Ai1 */\
		"movaps	    (%%rcx),%%xmm4	\n\t"/* Ar2 */\
		"movaps	0x10(%%rcx),%%xmm5	\n\t"/* Ai2 */\
		"movaps	    (%%rdx),%%xmm6	\n\t"/* Ar3 */\
		"movaps	0x10(%%rdx),%%xmm7	\n\t"/* Ai3 */\
		"subpd	%%xmm2,%%xmm0		\n\t"/* tr1 = Ar0 - Ar1 */\
		"subpd	%%xmm3,%%xmm1		\n\t"/* ti1 = Ai0 - Ai1 */\
		"subpd	%%xmm6,%%xmm4		\n\t"/* tr3 = Ar2 - Ar3 */\
		"subpd	%%xmm7,%%xmm5		\n\t"/* ti3 = Ai2 - Ai3 */\
		"mulpd	%%xmm8,%%xmm2		\n\t"/* 2*Ar1 */\
		"mulpd	%%xmm8,%%xmm3		\n\t"/* 2*Ai1 */\
		"mulpd	%%xmm8,%%xmm6		\n\t"/* 2*Ar3 */\
		"mulpd	%%xmm8,%%xmm7		\n\t"/* 2*Ai3 */\
		"addpd	%%xmm0,%%xmm2		\n\t"/* tr0 = Ar0 + Ar1 */\
		"addpd	%%xmm1,%%xmm3		\n\t"/* ti0 = Ai0 + Ai1 */\
		"addpd	%%xmm4,%%xmm6		\n\t"/* tr2 = Ar2 + Ar3 */\
		"addpd	%%xmm5,%%xmm7		\n\t"/* ti2 = Ai2 + Ai3 */\
	/*
	Br0 = tr0 + tr2;			Bi0 = ti0 + ti2;
	tr0 = tr0 - tr2;			ti0 = ti0 - ti2;
	r = tr3;tr3 = tr1 - ti3;	tr1 = tr1 + ti3;
			ti3 = ti1 + r  ;	ti1 = ti1 - r  ;
	*/\
		"movq	%[__cc0],%%rsi 		\n\t"/* Sincos data base-pointer */\
		"subpd	%%xmm6,%%xmm2		\n\t"/* tr0 = tr0 - tr2 */\
		"subpd	%%xmm7,%%xmm3		\n\t"/* ti0 = ti0 - ti2 */\
		"subpd	%%xmm5,%%xmm0		\n\t"/* tr3 = tr1 - ti3 */\
		"subpd	%%xmm4,%%xmm1		\n\t"/* ti1 = ti1 - tr3 */\
		"mulpd	%%xmm8,%%xmm6		\n\t"/* 2*tr2 */\
		"mulpd	%%xmm8,%%xmm7		\n\t"/* 2*ti2 */\
		"mulpd	%%xmm8,%%xmm5		\n\t"/* 2*ti3 */\
		"mulpd	%%xmm8,%%xmm4		\n\t"/* 2*tr3 */\
		"addpd	%%xmm2,%%xmm6		\n\t"/* Br0 = tr0 + tr2 */\
		"addpd	%%xmm3,%%xmm7		\n\t"/* Bi0 = ti0 + ti2 */\
		"addpd	%%xmm0,%%xmm5		\n\t"/* tr1 = tr1 + ti3 */\
		"addpd	%%xmm1,%%xmm4		\n\t"/* ti3 = ti1 + tr3 */\
		"movaps	%%xmm6,    (%%r10)	\n\t"/* Write Br0 */\
		"movaps	%%xmm7,0x10(%%r10)	\n\t"/* Write Bi0 */\
	/* Br2 = tr0*c2 + ti0*s2;	Bi2 = ti0*c2 - tr0*s2;	// twiddle = ~w2 = c2-I.s1 */\
		"movaps	%%xmm2,%%xmm6		\n\t"/* cpy tr0 */\
		"movaps	%%xmm3,%%xmm7		\n\t"/* cpy ti0 */\
		"mulpd	    (%%rsi),%%xmm6	\n\t"/* c2*tr0 */\
		"mulpd	0x10(%%rsi),%%xmm3	\n\t"/* s2*ti0 */\
		"mulpd	    (%%rsi),%%xmm7	\n\t"/* c2*ti0 */\
		"mulpd	0x10(%%rsi),%%xmm2	\n\t"/* s2*tr0 */\
		"addpd	%%xmm3,%%xmm6		\n\t"/* Br2 = tr0*c2 + ti0*s2 */\
		"subpd	%%xmm2,%%xmm7		\n\t"/* Bi2 = ti0*c2 - tr0*s2 */\
		"movaps	%%xmm6,    (%%r12)	\n\t"/* Write Br2 */\
		"movaps	%%xmm7,0x10(%%r12)	\n\t"/* Write Bi2 */\
	/* Br1 = tr1*c1 + ti1*s1;	Bi1 = ti1*c1 - tr1*s1;	// twiddle = ~w1 = c1-I.s1 */\
		"movaps	%%xmm5,%%xmm6		\n\t"/* cpy tr1 */\
		"movaps	%%xmm1,%%xmm7		\n\t"/* cpy ti1 */\
		"mulpd	0x20(%%rsi),%%xmm6	\n\t"/* c1*tr1 */\
		"mulpd	0x30(%%rsi),%%xmm1	\n\t"/* s1*ti1 */\
		"mulpd	0x20(%%rsi),%%xmm7	\n\t"/* c1*ti1 */\
		"mulpd	0x30(%%rsi),%%xmm5	\n\t"/* s1*tr1 */\
		"addpd	%%xmm1,%%xmm6		\n\t"/* Br1 = tr1*c1 + ti1*s1 */\
		"subpd	%%xmm5,%%xmm7		\n\t"/* Bi1 = ti1*c1 - tr1*s1 */\
		"movaps	%%xmm6,    (%%r11)	\n\t"/* Write Br1 */\
		"movaps	%%xmm7,0x10(%%r11)	\n\t"/* Write Bi1 */\
	/* Br3 = tr3*c3 + ti3*s3;	Bi3 = ti3*c3 - tr3*s3;	// twiddle = ~w3 = c3-I.s3 */\
		"movaps	%%xmm0,%%xmm6		\n\t"/* cpy tr3 */\
		"movaps	%%xmm4,%%xmm7		\n\t"/* cpy ti3 */\
		"mulpd	0x40(%%rsi),%%xmm6	\n\t"/* c3*tr3 */\
		"mulpd	0x50(%%rsi),%%xmm4	\n\t"/* s3*ti3 */\
		"mulpd	0x40(%%rsi),%%xmm7	\n\t"/* c3*ti3 */\
		"mulpd	0x50(%%rsi),%%xmm0	\n\t"/* s3*tr3 */\
		"addpd	%%xmm4,%%xmm6		\n\t"/* Br3 = tr3*c3 + ti3*s3 */\
		"subpd	%%xmm0,%%xmm7		\n\t"/* Bi3 = ti3*c3 - tr3*s3 */\
		"movaps	%%xmm6,    (%%r13)	\n\t"/* Write Br3 */\
		"movaps	%%xmm7,0x10(%%r13)	\n\t"/* Write Bi3 */\
		:					/* outputs: none */\
		: [__in0] "m" (Xin0)	/* All inputs from memory addresses here */\
		 ,[__in1] "m" (Xin1)\
		 ,[__in2] "m" (Xin2)\
		 ,[__in3] "m" (Xin3)\
		/* idiff, 'input-address difference', has variable-stored bytewise address offset between ptrs to 1st,2nd DFT's inputs */\
		 ,[__idiff] "m" (Xidiff)\
		 ,[__two] "m" (Xtwo)	/* pointer to vector-const 2.0 */\
		 ,[__cc0] "m" (Xcc0)\
		/* roff, 'roots-address offset', has literal-bytewise address offset between ptrs to 1st,2nd DFT's twiddles */\
		 ,[__roff] "e" (Xroff)\
		 ,[__out0] "m" (Xout0)\
		 ,[__out1] "m" (Xout1)\
		 ,[__out2] "m" (Xout2)\
		 ,[__out3] "m" (Xout3)\
		/* odiff, 'output-address difference', has variable-stored bytewise address offset between ptrs to 1st,2nd DFT's outputs */\
		 ,[__odiff] "m" (Xodiff)\
		: "cc","memory","rax","rbx","rcx","rdx","rdi","rsi","r8","r9","r10","r11","r12","r13","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15"	/* Clobbered registers */\
	);\
	}

	/* SSE2_RADIX_04_DIT_3TWIDDLE_X2: Two 3-twiddle radix-4 DIT DFTs done side by side,
	1st DFT in xmm0-7 (left asm column), 2nd DFT in xmm8-15 (right column). The 2nd DFT's
	inputs, twiddles and outputs sit at byte offsets __idiff (runtime variable), __roff
	(compile-time literal, hence the "e" constraint and %c[] use) and __odiff (runtime
	variable) from the 1st DFT's __in*, __cc0 and __out* addresses respectively.
	NOTE(review): xmm15 initially holds the vector-const 2.0 and is later consumed, so the
	2nd DFT's Im-part of input 3 is accessed from memory via (%%r10) rather than a register. */
	#define SSE2_RADIX_04_DIT_3TWIDDLE_X2(Xin0,Xin1,Xin2,Xin3,Xidiff, Xtwo,Xcc0,Xroff, Xout0,Xout1,Xout2,Xout3,Xodiff)\
	{\
	__asm__ volatile (\
		"movq	%[__two],%%rsi 		\n\t"\
		"movq	%[__in0],%%rax		\n\t"\
		"movq	%[__in1],%%rbx		\n\t"\
		"movq	%[__in2],%%rcx		\n\t"\
		"movq	%[__in3],%%rdx		\n\t"\
		"movslq	%[__idiff],%%rdi	\n\t"/* rdi = byte offset of 2nd DFT's inputs from 1st's */\
		"movaps	(%%rsi),%%xmm15	\n\t"/* two */\
		"movaps	    (%%rax),%%xmm0	\n\t"\
		"movaps	0x10(%%rax),%%xmm1	\n\t"\
		"movaps	    (%%rbx),%%xmm2	\n\t"\
		"movaps	0x10(%%rbx),%%xmm3	\n\t"\
		"movaps	    (%%rcx),%%xmm4	\n\t"\
		"movaps	0x10(%%rcx),%%xmm5	\n\t"\
		"movaps	    (%%rdx),%%xmm6	\n\t"\
		"movaps	0x10(%%rdx),%%xmm7	\n\t	leaq	0x10(%%rdx,%%rdi),%%r10		\n\t"/* r10 -> 2nd DFT's in3.im (no free xmm reg for it) */\
		"subpd	%%xmm2,%%xmm0		\n\t	movaps	    (%%rax,%%rdi),%%xmm8 	\n\t"\
		"subpd	%%xmm3,%%xmm1		\n\t	movaps	0x10(%%rax,%%rdi),%%xmm9 	\n\t"\
		"subpd	%%xmm6,%%xmm4		\n\t	movaps	    (%%rbx,%%rdi),%%xmm10	\n\t"\
		"subpd	%%xmm7,%%xmm5		\n\t	movaps	0x10(%%rbx,%%rdi),%%xmm11	\n\t"\
		"mulpd	%%xmm15,%%xmm2		\n\t	movaps	    (%%rcx,%%rdi),%%xmm12	\n\t"\
		"mulpd	%%xmm15,%%xmm3		\n\t	movaps	0x10(%%rcx,%%rdi),%%xmm13	\n\t"\
		"mulpd	%%xmm15,%%xmm6		\n\t	movaps	    (%%rdx,%%rdi),%%xmm14	\n\t"\
		"mulpd	%%xmm15,%%xmm7		\n\t"/*	movaps	0x10(%%rdx,%%rdi),%%xmm15	\n\t"*/\
		"addpd	%%xmm0,%%xmm2		\n\t	subpd	%%xmm10,%%xmm8 		\n\t"\
		"addpd	%%xmm1,%%xmm3		\n\t	subpd	%%xmm11,%%xmm9 		\n\t"\
		"addpd	%%xmm4,%%xmm6		\n\t	subpd	%%xmm14,%%xmm12		\n\t"\
		"addpd	%%xmm5,%%xmm7		\n\t	subpd	(%%r10),%%xmm13		\n\t"\
		"subpd	%%xmm6,%%xmm2		\n\t	mulpd	%%xmm15,%%xmm10		\n\t"\
		"subpd	%%xmm7,%%xmm3		\n\t	mulpd	%%xmm15,%%xmm11		\n\t"\
		"subpd	%%xmm5,%%xmm0		\n\t	mulpd	%%xmm15,%%xmm14		\n\t	movslq	%[__odiff],%%r9		\n\t"\
		"subpd	%%xmm4,%%xmm1		\n\t	mulpd	(%%r10),%%xmm15		\n\t	movq	%[__out0],%%r10		\n\t"/* xmm15 = 2.0*in3.im; r10 freed, repurposed as out0-ptr */\
		"mulpd	(%%rsi),%%xmm6		\n\t	addpd	%%xmm8 ,%%xmm10		\n\t	movq	%[__out1],%%r11		\n\t"\
		"mulpd	(%%rsi),%%xmm7		\n\t	addpd	%%xmm9 ,%%xmm11		\n\t	movq	%[__out2],%%r12		\n\t"\
		"mulpd	(%%rsi),%%xmm5		\n\t	addpd	%%xmm12,%%xmm14		\n\t	movq	%[__out3],%%r13		\n\t"\
		"mulpd	(%%rsi),%%xmm4		\n\t	addpd	%%xmm13,%%xmm15		\n\t"\
		"addpd	%%xmm2,%%xmm6		\n\t	subpd	%%xmm14,%%xmm10		\n\t"\
		"addpd	%%xmm3,%%xmm7		\n\t	subpd	%%xmm15,%%xmm11		\n\t"\
		"addpd	%%xmm0,%%xmm5		\n\t	subpd	%%xmm13,%%xmm8 		\n\t"\
		"addpd	%%xmm1,%%xmm4		\n\t	subpd	%%xmm12,%%xmm9 		\n\t"\
		"movaps	%%xmm6,    (%%r10)	\n\t	movaps	%%xmm7,0x10(%%r10)	\n\t"/* Write B0 to free up 2 regs */\
		"movaps	(%%rsi),%%xmm6		\n\t	mulpd	%%xmm6 ,%%xmm14		\n\t"/* reload vector-const 2.0 into xmm6 */\
		"									mulpd	%%xmm6 ,%%xmm15		\n\t"\
		"									mulpd	%%xmm6 ,%%xmm13		\n\t"\
		/* B2 = t0*~w2 = t0*[c2-I.s1] */"	mulpd	%%xmm6 ,%%xmm12		\n\t	movq	%[__cc0],%%rsi 		\n\t"\
		"movaps	%%xmm2,%%xmm6		\n\t	addpd	%%xmm10,%%xmm14		\n\t"\
		"movaps	%%xmm3,%%xmm7		\n\t	addpd	%%xmm11,%%xmm15		\n\t"\
		"mulpd	    (%%rsi),%%xmm6	\n\t	addpd	%%xmm8 ,%%xmm13		\n\t"\
		"mulpd	0x10(%%rsi),%%xmm3	\n\t	addpd	%%xmm9 ,%%xmm12		\n\t	leaq %c[__roff](%%rsi),%%r8	\n\t"/* r8 -> 2nd DFT's twiddles, at literal offset __roff from 1st's */\
		"mulpd	    (%%rsi),%%xmm7	\n\t	movaps %%xmm14,    (%%r10,%%r9)	\n\t"\
		"mulpd	0x10(%%rsi),%%xmm2	\n\t	movaps %%xmm15,0x10(%%r10,%%r9)	\n\t"\
		"addpd	%%xmm3,%%xmm6		\n\t	movaps	%%xmm10,%%xmm14		\n\t"\
		"subpd	%%xmm2,%%xmm7		\n\t	movaps	%%xmm11,%%xmm15		\n\t"\
		"movaps	%%xmm6,    (%%r12)	\n\t	mulpd	    (%%r8),%%xmm14	\n\t"\
		"movaps	%%xmm7,0x10(%%r12)	\n\t	mulpd	0x10(%%r8),%%xmm11	\n\t"\
		/* B1 = t1*~w1 = t1*[c1-I.s1] */"	mulpd	    (%%r8),%%xmm15	\n\t"\
		"movaps	0x20(%%rsi),%%xmm2	\n\t	mulpd	0x10(%%r8),%%xmm10	\n\t"\
		"movaps	0x30(%%rsi),%%xmm3	\n\t	addpd	%%xmm11,%%xmm14		\n\t"\
		"movaps	%%xmm5,%%xmm6		\n\t	subpd	%%xmm10,%%xmm15		\n\t"\
		"movaps	%%xmm1,%%xmm7		\n\t	movaps %%xmm14,    (%%r12,%%r9)	\n\t"\
		"mulpd	%%xmm2,%%xmm6		\n\t	movaps %%xmm15,0x10(%%r12,%%r9)	\n\t"\
		"mulpd	%%xmm3,%%xmm1		\n\t	movaps	%%xmm13,%%xmm14		\n\t"\
		"mulpd	%%xmm2,%%xmm7		\n\t	movaps	%%xmm9 ,%%xmm15		\n\t"\
		"mulpd	%%xmm3,%%xmm5		\n\t	mulpd	0x20(%%r8),%%xmm14	\n\t"\
		"movaps	0x40(%%rsi),%%xmm2	\n\t	mulpd	0x30(%%r8),%%xmm9 	\n\t"\
		"movaps	0x50(%%rsi),%%xmm3	\n\t	mulpd	0x20(%%r8),%%xmm15	\n\t"\
		"addpd	%%xmm1,%%xmm6		\n\t	mulpd	0x30(%%r8),%%xmm13	\n\t"\
		"subpd	%%xmm5,%%xmm7		\n\t	addpd	%%xmm9 ,%%xmm14		\n\t"\
		"movaps	%%xmm6,    (%%r11)	\n\t	subpd	%%xmm13,%%xmm15		\n\t"\
		"movaps	%%xmm7,0x10(%%r11)	\n\t	movaps %%xmm14,    (%%r11,%%r9)	\n\t"\
		/* B3 = t3*~w3 = t3*[c3-I.s3] */"	movaps %%xmm15,0x10(%%r11,%%r9)	\n\t"\
		"movaps	%%xmm0,%%xmm6		\n\t	movaps	%%xmm8 ,%%xmm14		\n\t"\
		"movaps	%%xmm4,%%xmm7		\n\t	movaps	%%xmm12,%%xmm15		\n\t"\
		"mulpd	%%xmm2,%%xmm6		\n\t	mulpd	0x40(%%r8),%%xmm14	\n\t"\
		"mulpd	%%xmm3,%%xmm4		\n\t	mulpd	0x50(%%r8),%%xmm12	\n\t"\
		"mulpd	%%xmm2,%%xmm7		\n\t	mulpd	0x40(%%r8),%%xmm15	\n\t"\
		"mulpd	%%xmm3,%%xmm0		\n\t	mulpd	0x50(%%r8),%%xmm8 	\n\t"\
		"addpd	%%xmm4,%%xmm6		\n\t	addpd	%%xmm12,%%xmm14		\n\t"\
		"subpd	%%xmm0,%%xmm7		\n\t	subpd	%%xmm8 ,%%xmm15		\n\t"\
		"movaps	%%xmm6,    (%%r13)	\n\t	movaps	%%xmm14,    (%%r13,%%r9)	\n\t"\
		"movaps	%%xmm7,0x10(%%r13)	\n\t	movaps	%%xmm15,0x10(%%r13,%%r9)	\n\t"\
		:					/* outputs: none */\
		: [__in0] "m" (Xin0)	/* All inputs from memory addresses here */\
		 ,[__in1] "m" (Xin1)\
		 ,[__in2] "m" (Xin2)\
		 ,[__in3] "m" (Xin3)\
		/* idiff, 'input-address difference', has variable-stored bytewise address offset between ptrs to 1st,2nd DFT's inputs */\
		 ,[__idiff] "m" (Xidiff)\
		 ,[__two] "m" (Xtwo)	/* pointer to vector-const 2.0 */\
		 ,[__cc0] "m" (Xcc0)\
		/* roff, 'roots-address offset', has literal-bytewise address offset between ptrs to 1st,2nd DFT's twiddles */\
		 ,[__roff] "e" (Xroff)\
		 ,[__out0] "m" (Xout0)\
		 ,[__out1] "m" (Xout1)\
		 ,[__out2] "m" (Xout2)\
		 ,[__out3] "m" (Xout3)\
		/* odiff, 'output-address difference', has variable-stored bytewise address offset between ptrs to 1st,2nd DFT's outputs */\
		 ,[__odiff] "m" (Xodiff)\
		: "cc","memory","rax","rbx","rcx","rdx","rdi","rsi","r8","r9","r10","r11","r12","r13","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15"	/* Clobbered registers */\
	);\
	}

	/* SSE2_RADIX_05_DFT_0TWIDDLE: Twiddleless radix-5 DFT. Reads complex inputs from
	memlocs __i0-4, writes outputs to __o0-4, using the trig constants stored in the
	vec-dbl slots at __cc1 + [0x00,0x10,0x20,0x30,0x40]. Output __o0 is written early
	and then reread as a temporary. NOTE(review): the __i0 slot is overwritten partway
	through ("movaps %%xmm4,(%%rsi)") and reread as scratch, so the __i0 data does not
	survive the call — confirm callers rely on this convention before reuse. */
	#define SSE2_RADIX_05_DFT_0TWIDDLE(Xi0,Xi1,Xi2,Xi3,Xi4, Xcc1, Xo0,Xo1,Xo2,Xo3,Xo4)\
	{\
	__asm__ volatile (\
		"movq	%[__i0],%%rsi		\n\t"\
		"movq	%[__i1],%%rax		\n\t"\
		"movq	%[__i2],%%rbx		\n\t"\
		"movq	%[__i3],%%rcx		\n\t"\
		"movq	%[__i4],%%rdx		\n\t"\
		"movq	%[__o0],%%rdi		\n\t"\
		"movaps	    (%%rax),%%xmm0	\n\t"\
		"movaps	0x10(%%rax),%%xmm1	\n\t"\
		"movaps	    (%%rbx),%%xmm2	\n\t"\
		"movaps	0x10(%%rbx),%%xmm3	\n\t"\
		"movaps	    (%%rcx),%%xmm4	\n\t"\
		"movaps	0x10(%%rcx),%%xmm5	\n\t"\
		"movaps	    (%%rdx),%%xmm6	\n\t"\
		"movaps	0x10(%%rdx),%%xmm7	\n\t"\
		"subpd	%%xmm6,%%xmm0		\n\t"\
		"subpd	%%xmm7,%%xmm1		\n\t"\
		"addpd	%%xmm6,%%xmm6		\n\t"\
		"addpd	%%xmm7,%%xmm7		\n\t"\
		"addpd	%%xmm0,%%xmm6		\n\t"\
		"addpd	%%xmm1,%%xmm7		\n\t"\
		"subpd	%%xmm4,%%xmm2		\n\t"\
		"subpd	%%xmm5,%%xmm3		\n\t"\
		"addpd	%%xmm4,%%xmm4		\n\t"\
		"addpd	%%xmm5,%%xmm5		\n\t"\
		"addpd	%%xmm2,%%xmm4		\n\t"\
		"addpd	%%xmm3,%%xmm5		\n\t"\
	"movq	%[__cc1],%%rax		\n\t"/* rax -> trig consts; __i1 data already loaded, ptr no longer needed */\
		"subpd	%%xmm4,%%xmm6		\n\t"\
		"subpd	%%xmm5,%%xmm7		\n\t"\
		"addpd	%%xmm4,%%xmm4		\n\t"\
		"addpd	%%xmm5,%%xmm5		\n\t"\
		"addpd	%%xmm6,%%xmm4		\n\t"\
		"addpd	%%xmm7,%%xmm5		\n\t"\
		"addpd	    (%%rsi),%%xmm4	\n\t"\
		"addpd	0x10(%%rsi),%%xmm5	\n\t"\
		"movaps	%%xmm4,    (%%rdi)	\n\t"/* Write B0 = sum of all inputs */\
		"movaps	%%xmm5,0x10(%%rdi)	\n\t"\
		"mulpd	0x10(%%rax),%%xmm6	\n\t"\
		"mulpd	0x10(%%rax),%%xmm7	\n\t"\
		"subpd	     (%%rsi),%%xmm4	\n\t"\
		"subpd	0x010(%%rsi),%%xmm5	\n\t"\
		"mulpd	    (%%rax),%%xmm4	\n\t"\
		"mulpd	    (%%rax),%%xmm5	\n\t"\
		"addpd	     (%%rdi),%%xmm4	\n\t"\
		"addpd	0x010(%%rdi),%%xmm5	\n\t"\
		"subpd	%%xmm6,%%xmm4		\n\t"\
		"subpd	%%xmm7,%%xmm5		\n\t"\
		"addpd	%%xmm6,%%xmm6		\n\t"\
		"addpd	%%xmm7,%%xmm7		\n\t"\
		"addpd	%%xmm4,%%xmm6		\n\t"\
		"addpd	%%xmm5,%%xmm7		\n\t"\
		"movaps	%%xmm4,    (%%rsi)	\n\t"/* stash intermediate in the __i0 slot (clobbers input 0) */\
		"movaps	%%xmm5,0x10(%%rsi)	\n\t"\
		"movaps	%%xmm0,%%xmm4		\n\t"\
		"movaps	%%xmm1,%%xmm5		\n\t"\
		"subpd	%%xmm2,%%xmm0		\n\t"\
		"subpd	%%xmm3,%%xmm1		\n\t"\
		"mulpd	0x20(%%rax),%%xmm0	\n\t"\
		"mulpd	0x20(%%rax),%%xmm1	\n\t"\
		"mulpd	0x30(%%rax),%%xmm2	\n\t"\
		"mulpd	0x30(%%rax),%%xmm3	\n\t"\
		"mulpd	0x40(%%rax),%%xmm4	\n\t"\
		"mulpd	0x40(%%rax),%%xmm5	\n\t"\
		"addpd	%%xmm0,%%xmm2		\n\t"\
		"addpd	%%xmm1,%%xmm3		\n\t"\
		"subpd	%%xmm4,%%xmm0		\n\t"\
		"subpd	%%xmm5,%%xmm1		\n\t"\
		"movaps	    (%%rsi),%%xmm4	\n\t"/* reload stashed intermediate from the __i0 slot */\
		"movaps	0x10(%%rsi),%%xmm5	\n\t"\
		"movq	%[__o1],%%rax		\n\t"\
		"movq	%[__o4],%%rdx		\n\t"\
		"subpd	%%xmm3,%%xmm6		\n\t"\
		"subpd	%%xmm2,%%xmm7		\n\t"\
		"addpd	%%xmm3,%%xmm3		\n\t"\
		"addpd	%%xmm2,%%xmm2		\n\t"\
		"movaps	%%xmm6,    (%%rax)	\n\t"\
		"movaps	%%xmm7,0x10(%%rdx)	\n\t"\
		"addpd	%%xmm6,%%xmm3		\n\t"\
		"addpd	%%xmm7,%%xmm2		\n\t"\
		"movaps	%%xmm3,    (%%rdx)	\n\t"\
		"movaps	%%xmm2,0x10(%%rax)	\n\t"\
		"movq	%[__o2],%%rbx		\n\t"\
		"movq	%[__o3],%%rcx		\n\t"\
		"subpd	%%xmm1,%%xmm4		\n\t"\
		"subpd	%%xmm0,%%xmm5		\n\t"\
		"addpd	%%xmm1,%%xmm1		\n\t"\
		"addpd	%%xmm0,%%xmm0		\n\t"\
		"movaps	%%xmm4,    (%%rbx)	\n\t"\
		"movaps	%%xmm5,0x10(%%rcx)	\n\t"\
		"addpd	%%xmm4,%%xmm1		\n\t"\
		"addpd	%%xmm5,%%xmm0		\n\t"\
		"movaps	%%xmm1,    (%%rcx)	\n\t"\
		"movaps	%%xmm0,0x10(%%rbx)	\n\t"\
		:					/* outputs: none */\
		: [__i0] "m" (Xi0)	/* All inputs from memory addresses here */\
		 ,[__i1] "m" (Xi1)\
		 ,[__i2] "m" (Xi2)\
		 ,[__i3] "m" (Xi3)\
		 ,[__i4] "m" (Xi4)\
		 ,[__cc1] "m" (Xcc1)\
		 ,[__o0] "m" (Xo0)\
		 ,[__o1] "m" (Xo1)\
		 ,[__o2] "m" (Xo2)\
		 ,[__o3] "m" (Xo3)\
		 ,[__o4] "m" (Xo4)\
		: "cc","memory","rax","rbx","rcx","rdx","rdi","rsi","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7"		/* Clobbered registers */\
	);\
	}

	/* 16-xmm-register version does 2 of the above side-by-side: */
	#define SSE2_RADIX_05_DFT_0TWIDDLE_X2(Xcc1,Xtwo, Xi0,Xi1,Xi2,Xi3,Xi4, Xo0,Xo1,Xo2,Xo3,Xo4, Xj0,Xj1,Xj2,Xj3,Xj4, Xu0,Xu1,Xu2,Xu3,Xu4)\
	{\
	__asm__ volatile (\
		"movq	%[__i0],%%rsi		\n\t	movq	%[__j0],%%r10		\n\t"\
		"movq	%[__i1],%%rax		\n\t	movq	%[__j1],%%r11		\n\t"\
		"movq	%[__i2],%%rbx		\n\t	movq	%[__j2],%%r12		\n\t"\
		"movq	%[__i3],%%rcx		\n\t	movq	%[__j3],%%r13		\n\t"\
		"movq	%[__i4],%%rdx		\n\t	movq	%[__j4],%%r14		\n\t"\
		"movq	%[__o0],%%rdi		\n\t	movq	%[__u0],%%r15		\n\t"\
		"movaps	    (%%rax),%%xmm0	\n\t	movaps	    (%%r11),%%xmm8 	\n\t"\
		"movaps	0x10(%%rax),%%xmm1	\n\t	movaps	0x10(%%r11),%%xmm9 	\n\t"\
		"movaps	    (%%rbx),%%xmm2	\n\t	movaps	    (%%r12),%%xmm10	\n\t"\
		"movaps	0x10(%%rbx),%%xmm3	\n\t	movaps	0x10(%%r12),%%xmm11	\n\t"\
		"movaps	    (%%rcx),%%xmm4	\n\t	movaps	    (%%r13),%%xmm12	\n\t"\
		"movaps	0x10(%%rcx),%%xmm5	\n\t	movaps	0x10(%%r13),%%xmm13	\n\t"\
		"movaps	    (%%rdx),%%xmm6	\n\t	movaps	    (%%r14),%%xmm14	\n\t"\
		"movaps	0x10(%%rdx),%%xmm7	\n\t	movaps	0x10(%%r14),%%xmm15	\n\t"\
		"subpd	%%xmm6,%%xmm0		\n\t	subpd	%%xmm14,%%xmm8 		\n\t"\
		"subpd	%%xmm7,%%xmm1		\n\t	subpd	%%xmm15,%%xmm9 		\n\t"\
		"addpd	%%xmm6,%%xmm6		\n\t	addpd	%%xmm14,%%xmm14		\n\t"\
		"addpd	%%xmm7,%%xmm7		\n\t	addpd	%%xmm15,%%xmm15		\n\t"\
		"addpd	%%xmm0,%%xmm6		\n\t	addpd	%%xmm8 ,%%xmm14		\n\t"\
		"addpd	%%xmm1,%%xmm7		\n\t	addpd	%%xmm9 ,%%xmm15		\n\t"\
		"subpd	%%xmm4,%%xmm2		\n\t	subpd	%%xmm12,%%xmm10		\n\t"\
		"subpd	%%xmm5,%%xmm3		\n\t	subpd	%%xmm13,%%xmm11		\n\t"\
		"addpd	%%xmm4,%%xmm4		\n\t	addpd	%%xmm12,%%xmm12		\n\t"\
		"addpd	%%xmm5,%%xmm5		\n\t	addpd	%%xmm13,%%xmm13		\n\t"\
		"addpd	%%xmm2,%%xmm4		\n\t	addpd	%%xmm10,%%xmm12		\n\t"\
		"addpd	%%xmm3,%%xmm5		\n\t	addpd	%%xmm11,%%xmm13		\n\t"\
		"movq	%[__cc1],%%rax		\n\t"\
		"subpd	%%xmm4,%%xmm6		\n\t	subpd	%%xmm12,%%xmm14		\n\t"\
		"subpd	%%xmm5,%%xmm7		\n\t	subpd	%%xmm13,%%xmm15		\n\t"\
		"addpd	%%xmm4,%%xmm4		\n\t	addpd	%%xmm12,%%xmm12		\n\t"\
		"addpd	%%xmm5,%%xmm5		\n\t	addpd	%%xmm13,%%xmm13		\n\t"\
		"addpd	%%xmm6,%%xmm4		\n\t	addpd	%%xmm14,%%xmm12		\n\t"\
		"addpd	%%xmm7,%%xmm5		\n\t	addpd	%%xmm15,%%xmm13		\n\t"\
		"addpd	    (%%rsi),%%xmm4	\n\t	addpd	    (%%r10),%%xmm12	\n\t"\
		"addpd	0x10(%%rsi),%%xmm5	\n\t	addpd	0x10(%%r10),%%xmm13	\n\t"\
		"movaps	%%xmm4,    (%%rdi)	\n\t	movaps	%%xmm12,    (%%r15)	\n\t"\
		"movaps	%%xmm5,0x10(%%rdi)	\n\t	movaps	%%xmm13,0x10(%%r15)	\n\t"\
		"mulpd	0x10(%%rax),%%xmm6	\n\t	mulpd	0x10(%%rax),%%xmm14	\n\t"\
		"mulpd	0x10(%%rax),%%xmm7	\n\t	mulpd	0x10(%%rax),%%xmm15	\n\t"\
		"subpd	    (%%rsi),%%xmm4	\n\t	subpd	    (%%r10),%%xmm12	\n\t"\
		"subpd	0x10(%%rsi),%%xmm5	\n\t	subpd	0x10(%%r10),%%xmm13	\n\t"\
		"mulpd	    (%%rax),%%xmm4	\n\t	mulpd	    (%%rax),%%xmm12	\n\t"\
		"mulpd	    (%%rax),%%xmm5	\n\t	mulpd	    (%%rax),%%xmm13	\n\t"\
		"addpd	    (%%rdi),%%xmm4	\n\t	addpd	    (%%r15),%%xmm12	\n\t"\
		"addpd	0x10(%%rdi),%%xmm5	\n\t	addpd	0x10(%%r15),%%xmm13	\n\t"\
		"subpd	%%xmm6,%%xmm4		\n\t	subpd	%%xmm14,%%xmm12		\n\t"\
		"subpd	%%xmm7,%%xmm5		\n\t	subpd	%%xmm15,%%xmm13		\n\t"\
		"addpd	%%xmm6,%%xmm6		\n\t	addpd	%%xmm14,%%xmm14		\n\t"\
		"addpd	%%xmm7,%%xmm7		\n\t	addpd	%%xmm15,%%xmm15		\n\t"\
		"addpd	%%xmm4,%%xmm6		\n\t	addpd	%%xmm12,%%xmm14		\n\t"\
		"addpd	%%xmm5,%%xmm7		\n\t	addpd	%%xmm13,%%xmm15		\n\t"\
		"movaps	%%xmm4,    (%%rsi)	\n\t	movaps	%%xmm12,    (%%r10)	\n\t"\
		"movaps	%%xmm5,0x10(%%rsi)	\n\t	movaps	%%xmm13,0x10(%%r10)	\n\t"\
		"movaps	%%xmm0,%%xmm4		\n\t	movaps	%%xmm8 ,%%xmm12		\n\t"\
		"movaps	%%xmm1,%%xmm5		\n\t	movaps	%%xmm9 ,%%xmm13		\n\t"\
		"subpd	%%xmm2,%%xmm0		\n\t	subpd	%%xmm10,%%xmm8 		\n\t"\
		"subpd	%%xmm3,%%xmm1		\n\t	subpd	%%xmm11,%%xmm9 		\n\t"\
		"mulpd	0x20(%%rax),%%xmm0	\n\t	mulpd	0x20(%%rax),%%xmm8 	\n\t"\
		"mulpd	0x20(%%rax),%%xmm1	\n\t	mulpd	0x20(%%rax),%%xmm9 	\n\t"\
		"mulpd	0x30(%%rax),%%xmm2	\n\t	mulpd	0x30(%%rax),%%xmm10	\n\t"\
		"mulpd	0x30(%%rax),%%xmm3	\n\t	mulpd	0x30(%%rax),%%xmm11	\n\t"\
		"mulpd	0x40(%%rax),%%xmm4	\n\t	mulpd	0x40(%%rax),%%xmm12	\n\t"\
		"mulpd	0x40(%%rax),%%xmm5	\n\t	mulpd	0x40(%%rax),%%xmm13	\n\t"\
		"addpd	%%xmm0,%%xmm2		\n\t	addpd	%%xmm8 ,%%xmm10		\n\t"\
		"addpd	%%xmm1,%%xmm3		\n\t	addpd	%%xmm9 ,%%xmm11		\n\t"\
		"subpd	%%xmm4,%%xmm0		\n\t	subpd	%%xmm12,%%xmm8 		\n\t"\
		"subpd	%%xmm5,%%xmm1		\n\t	subpd	%%xmm13,%%xmm9 		\n\t"\
		"movaps	    (%%rsi),%%xmm4	\n\t	movaps	    (%%r10),%%xmm12	\n\t"\
		"movaps	0x10(%%rsi),%%xmm5	\n\t	movaps	0x10(%%r10),%%xmm13	\n\t"\
		"movq	%[__o1],%%rax		\n\t	movq	%[__u1],%%r11		\n\t"\
		"movq	%[__o4],%%rdx		\n\t	movq	%[__u4],%%r14		\n\t"\
		"subpd	%%xmm3,%%xmm6		\n\t	subpd	%%xmm11,%%xmm14		\n\t"\
		"subpd	%%xmm2,%%xmm7		\n\t	subpd	%%xmm10,%%xmm15		\n\t"\
		"addpd	%%xmm3,%%xmm3		\n\t	addpd	%%xmm11,%%xmm11		\n\t"\
		"addpd	%%xmm2,%%xmm2		\n\t	addpd	%%xmm10,%%xmm10		\n\t"\
		"movaps	%%xmm6,    (%%rax)	\n\t	movaps	%%xmm14,    (%%r11)	\n\t"\
		"movaps	%%xmm7,0x10(%%rdx)	\n\t	movaps	%%xmm15,0x10(%%r14)	\n\t"\
		"addpd	%%xmm6,%%xmm3		\n\t	addpd	%%xmm14,%%xmm11		\n\t"\
		"addpd	%%xmm7,%%xmm2		\n\t	addpd	%%xmm15,%%xmm10		\n\t"\
		"movaps	%%xmm3,    (%%rdx)	\n\t	movaps	%%xmm11,    (%%r14)	\n\t"\
		"movaps	%%xmm2,0x10(%%rax)	\n\t	movaps	%%xmm10,0x10(%%r11)	\n\t"\
		"movq	%[__o2],%%rbx		\n\t	movq	%[__u2],%%r12		\n\t"\
		"movq	%[__o3],%%rcx		\n\t	movq	%[__u3],%%r13		\n\t"\
		"subpd	%%xmm1,%%xmm4		\n\t	subpd	%%xmm9 ,%%xmm12		\n\t"\
		"subpd	%%xmm0,%%xmm5		\n\t	subpd	%%xmm8 ,%%xmm13		\n\t"\
		"addpd	%%xmm1,%%xmm1		\n\t	addpd	%%xmm9 ,%%xmm9 		\n\t"\
		"addpd	%%xmm0,%%xmm0		\n\t	addpd	%%xmm8 ,%%xmm8 		\n\t"\
		"movaps	%%xmm4,    (%%rbx)	\n\t	movaps	%%xmm12,    (%%r12)	\n\t"\
		"movaps	%%xmm5,0x10(%%rcx)	\n\t	movaps	%%xmm13,0x10(%%r13)	\n\t"\
		"addpd	%%xmm4,%%xmm1		\n\t	addpd	%%xmm12,%%xmm9 		\n\t"\
		"addpd	%%xmm5,%%xmm0		\n\t	addpd	%%xmm13,%%xmm8 		\n\t"\
		"movaps	%%xmm1,    (%%rcx)	\n\t	movaps	%%xmm9 ,    (%%r13)	\n\t"\
		"movaps	%%xmm0,0x10(%%rbx)	\n\t	movaps	%%xmm8 ,0x10(%%r12)	\n\t"\
		:					/* outputs: none */\
		: [__cc1] "m" (Xcc1)	/* All inputs from memory addresses here */\
		 ,[__two] "m" (Xtwo)\
		 ,[__i0] "m" (Xi0)\
		 ,[__i1] "m" (Xi1)\
		 ,[__i2] "m" (Xi2)\
		 ,[__i3] "m" (Xi3)\
		 ,[__i4] "m" (Xi4)\
		 ,[__o0] "m" (Xo0)\
		 ,[__o1] "m" (Xo1)\
		 ,[__o2] "m" (Xo2)\
		 ,[__o3] "m" (Xo3)\
		 ,[__o4] "m" (Xo4)\
		 ,[__j0] "m" (Xj0)\
		 ,[__j1] "m" (Xj1)\
		 ,[__j2] "m" (Xj2)\
		 ,[__j3] "m" (Xj3)\
		 ,[__j4] "m" (Xj4)\
		 ,[__u0] "m" (Xu0)\
		 ,[__u1] "m" (Xu1)\
		 ,[__u2] "m" (Xu2)\
		 ,[__u3] "m" (Xu3)\
		 ,[__u4] "m" (Xu4)\
		: "cc","memory","rax","rbx","rcx","rdx","rdi","rsi","r10","r11","r12","r13","r14","r15","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15"		/* Clobbered registers */\
	);\
	}

	/*...Radix-7 DFT: Inputs in memlocs __i0-6, outputs into __o0-6, possibly coincident with inputs:\ */\
	/* Real parts run in xmm0-7 (left asm column), imaginary parts in xmm8-15 (right
	column); __cc points at the radix-7 trig constants (vec-dbl slots 0x00-0x70). */
	#define SSE2_RADIX_07_DFT(Xi0,Xi1,Xi2,Xi3,Xi4,Xi5,Xi6, Xcc, Xo0,Xo1,Xo2,Xo3,Xo4,Xo5,Xo6)\
	{\
	__asm__ volatile (\
		"movq	%[__i1],%%rax		\n\t"\
		"movq	%[__i2],%%rbx		\n\t"\
		"movq	%[__i3],%%rcx		\n\t"\
		"movq	%[__i4],%%rdx		\n\t"\
		"movq	%[__i5],%%rsi		\n\t"\
		"movq	%[__i6],%%rdi		\n\t	/*** Imaginary Parts: ***/	\n\t"\
		"movaps	(%%rax),%%xmm6		\n\t	movaps	0x10(%%rax),%%xmm14	\n\t"\
		"movaps	(%%rdi),%%xmm1		\n\t	movaps	0x10(%%rdi),%%xmm9 	\n\t"\
		"movaps	(%%rbx),%%xmm5		\n\t	movaps	0x10(%%rbx),%%xmm13	\n\t"\
		"movaps	(%%rsi),%%xmm2		\n\t	movaps	0x10(%%rsi),%%xmm10	\n\t"\
		"movaps	(%%rcx),%%xmm4		\n\t	movaps	0x10(%%rcx),%%xmm12	\n\t"\
		"movaps	(%%rdx),%%xmm3		\n\t	movaps	0x10(%%rdx),%%xmm11	\n\t"\
		"movq	%[__i0],%%rbx		\n\t"\
		"subpd	%%xmm1,%%xmm6		\n\t	subpd	%%xmm9 ,%%xmm14		\n\t"\
		"addpd	%%xmm1,%%xmm1		\n\t	addpd	%%xmm9 ,%%xmm9 		\n\t"\
		"addpd	%%xmm6,%%xmm1		\n\t	addpd	%%xmm14,%%xmm9  	\n\t"\
		"subpd	%%xmm2,%%xmm5		\n\t	subpd	%%xmm10,%%xmm13		\n\t"\
		"addpd	%%xmm2,%%xmm2		\n\t	addpd	%%xmm10,%%xmm10		\n\t"\
		"addpd	%%xmm5,%%xmm2		\n\t	addpd	%%xmm13,%%xmm10		\n\t"\
		"movaps	(%%rbx),%%xmm0		\n\t	movaps	0x10(%%rbx),%%xmm8 	\n\t"\
		"subpd	%%xmm3,%%xmm4		\n\t	subpd	%%xmm11,%%xmm12		\n\t"\
		"addpd	%%xmm3,%%xmm3		\n\t	addpd	%%xmm11,%%xmm11		\n\t"\
		"addpd	%%xmm4,%%xmm3		\n\t	addpd	%%xmm12,%%xmm11		\n\t"\
		"\n\t"\
		"movq	%[__o0],%%rcx		\n\t"\
		"movq	%[__cc],%%rsi		\n\t"\
		"movaps	%%xmm0,0x80(%%rsi)	\n\t	movaps	%%xmm8 ,0xa0(%%rsi)	\n\t"/* spill into slots past the cc consts — NOTE(review): assumes scratch room at __cc+0x80..0xb0; confirm the cc-array allocation */\
		"movaps	%%xmm6,0x90(%%rsi)	\n\t	movaps	%%xmm14,0xb0(%%rsi)	\n\t"\
		"addpd	%%xmm1,%%xmm0		\n\t	addpd	%%xmm9 ,%%xmm8  	\n\t"\
		"movaps	%%xmm5,%%xmm7		\n\t	movaps	%%xmm13,%%xmm15		\n\t"\
		"addpd	%%xmm2,%%xmm3		\n\t	addpd	%%xmm10,%%xmm11		\n\t"\
		"subpd	%%xmm4,%%xmm5		\n\t	subpd	%%xmm12,%%xmm13		\n\t"\
		"subpd	%%xmm2,%%xmm1		\n\t	subpd	%%xmm10,%%xmm9  	\n\t"\
		"subpd	%%xmm7,%%xmm6		\n\t	subpd	%%xmm15,%%xmm14		\n\t"\
		"addpd	%%xmm2,%%xmm2		\n\t	addpd	%%xmm10,%%xmm10		\n\t"\
		"addpd	%%xmm7,%%xmm4		\n\t	addpd	%%xmm15,%%xmm12		\n\t"\
		"addpd	%%xmm3,%%xmm0		\n\t	addpd	%%xmm11,%%xmm8  	\n\t"\
		"addpd	0x90(%%rsi),%%xmm5	\n\t	addpd	0xb0(%%rsi),%%xmm13	\n\t"\
		"subpd	%%xmm2,%%xmm3		\n\t	subpd	%%xmm10,%%xmm11		\n\t"\
		"movaps	%%xmm4,%%xmm7		\n\t	movaps	%%xmm12,%%xmm15		\n\t"\
		"movaps	%%xmm0,    (%%rcx)	\n\t	movaps	%%xmm8 ,0x10(%%rcx)	\n\t"/* B0 */\
		"subpd	%%xmm6,%%xmm4		\n\t	subpd	%%xmm14,%%xmm12		\n\t"\
		"movaps	%%xmm1,%%xmm2		\n\t	movaps	%%xmm9 ,%%xmm10		\n\t"\
		"subpd	0x80(%%rsi),%%xmm0	\n\t	subpd	0xa0(%%rsi),%%xmm8  \n\t"\
		"mulpd	0x10(%%rsi),%%xmm5	\n\t	mulpd	0x10(%%rsi),%%xmm13	\n\t"\
		"addpd	%%xmm3,%%xmm2		\n\t	addpd	%%xmm11,%%xmm10		\n\t"\
		"mulpd	0x40(%%rsi),%%xmm3	\n\t	mulpd	0x40(%%rsi),%%xmm11	\n\t"\
		"mulpd	0x70(%%rsi),%%xmm4	\n\t	mulpd	0x70(%%rsi),%%xmm12	\n\t"\
		"mulpd	0x20(%%rsi),%%xmm1	\n\t	mulpd	0x20(%%rsi),%%xmm9  \n\t"\
		"mulpd	0x30(%%rsi),%%xmm6	\n\t	mulpd	0x30(%%rsi),%%xmm14	\n\t"\
		"mulpd	    (%%rsi),%%xmm0	\n\t	mulpd	    (%%rsi),%%xmm8  \n\t"\
		"mulpd	0x50(%%rsi),%%xmm7	\n\t	mulpd	0x50(%%rsi),%%xmm15	\n\t"\
		"mulpd	0x60(%%rsi),%%xmm2	\n\t	mulpd	0x60(%%rsi),%%xmm10	\n\t"\
		"addpd	    (%%rcx),%%xmm0	\n\t	addpd	0x10(%%rcx),%%xmm8  \n\t"\
		"addpd	%%xmm4,%%xmm6		\n\t	addpd	%%xmm12,%%xmm14		\n\t"\
		"subpd	%%xmm2,%%xmm1		\n\t	subpd	%%xmm10,%%xmm9  	\n\t"\
		"subpd	%%xmm7,%%xmm4		\n\t	subpd	%%xmm15,%%xmm12		\n\t"\
		"subpd	%%xmm2,%%xmm3		\n\t	subpd	%%xmm10,%%xmm11		\n\t"\
		"movq	%[__o1],%%rax		\n\t"\
		"movq	%[__o2],%%rbx		\n\t"\
		"movq	%[__o3],%%rcx		\n\t"\
		"movq	%[__o4],%%rdx		\n\t"\
		"movq	%[__o5],%%rsi		\n\t"\
		"movq	%[__o6],%%rdi		\n\t"\
		"movaps	%%xmm0,%%xmm2		\n\t	movaps	%%xmm8 ,%%xmm10		\n\t"\
		"movaps	%%xmm5,%%xmm7		\n\t	movaps	%%xmm13,%%xmm15		\n\t"\
		"addpd	%%xmm1,%%xmm0		\n\t	addpd	%%xmm9 ,%%xmm8  	\n\t"\
		"addpd	%%xmm6,%%xmm5		\n\t	addpd	%%xmm14,%%xmm13		\n\t"\
		"addpd	%%xmm3,%%xmm1		\n\t	addpd	%%xmm11,%%xmm9  	\n\t"\
		"addpd	%%xmm4,%%xmm6		\n\t	addpd	%%xmm12,%%xmm14		\n\t"\
		"addpd	%%xmm2,%%xmm3		\n\t	addpd	%%xmm10,%%xmm11		\n\t"\
		"addpd	%%xmm7,%%xmm4		\n\t	addpd	%%xmm15,%%xmm12		\n\t"\
		"subpd	%%xmm1,%%xmm2		\n\t	subpd	%%xmm9 ,%%xmm10		\n\t"\
		"subpd	%%xmm6,%%xmm7		\n\t	subpd	%%xmm14,%%xmm15		\n\t"\
		"/* xmm1,6,9,14 free ... Note the order reversal on the 3rd pair of outputs: */\n\t"\
		"subpd	%%xmm13,%%xmm0		\n\t	subpd	%%xmm15,%%xmm2		\n\t	subpd	%%xmm12,%%xmm3 		\n\t"\
		"subpd	%%xmm5 ,%%xmm8  	\n\t	subpd	%%xmm7 ,%%xmm10		\n\t	subpd	%%xmm4 ,%%xmm11		\n\t"\
		"addpd	%%xmm13,%%xmm13		\n\t	addpd	%%xmm15,%%xmm15		\n\t	addpd	%%xmm12,%%xmm12		\n\t"\
		"addpd	%%xmm5 ,%%xmm5		\n\t	addpd	%%xmm7 ,%%xmm7		\n\t	addpd	%%xmm4 ,%%xmm4 		\n\t"\
		"addpd	%%xmm0 ,%%xmm13		\n\t	addpd	%%xmm2 ,%%xmm15		\n\t	addpd	%%xmm3 ,%%xmm12		\n\t"\
		"addpd	%%xmm8 ,%%xmm5		\n\t	addpd	%%xmm10,%%xmm7		\n\t	addpd	%%xmm11,%%xmm4 		\n\t"\
		"movaps	%%xmm0 ,    (%%rax)	\n\t	movaps	%%xmm2 ,    (%%rbx)	\n\t	movaps	%%xmm3 ,    (%%rdx)	\n\t"/* B124r */\
		"movaps	%%xmm8 ,0x10(%%rdi)	\n\t	movaps	%%xmm10,0x10(%%rsi)	\n\t	movaps	%%xmm11,0x10(%%rcx)	\n\t"/* B653i */\
		"movaps	%%xmm13,    (%%rdi)	\n\t	movaps	%%xmm15,    (%%rsi)	\n\t	movaps	%%xmm12,    (%%rcx)	\n\t"/* B653r */\
		"movaps	%%xmm5 ,0x10(%%rax)	\n\t	movaps	%%xmm7 ,0x10(%%rbx)	\n\t	movaps	%%xmm4 ,0x10(%%rdx)	\n\t"/* B124i */\
		"\n\t"\
		:					/* outputs: none */\
		: [__i0] "m" (Xi0)	/* All inputs from memory addresses here */\
		 ,[__i1] "m" (Xi1)\
		 ,[__i2] "m" (Xi2)\
		 ,[__i3] "m" (Xi3)\
		 ,[__i4] "m" (Xi4)\
		 ,[__i5] "m" (Xi5)\
		 ,[__i6] "m" (Xi6)\
		 ,[__cc] "m" (Xcc)\
		 ,[__o0] "m" (Xo0)\
		 ,[__o1] "m" (Xo1)\
		 ,[__o2] "m" (Xo2)\
		 ,[__o3] "m" (Xo3)\
		 ,[__o4] "m" (Xo4)\
		 ,[__o5] "m" (Xo5)\
		 ,[__o6] "m" (Xo6)\
		: "cc","memory","rax","rbx","rcx","rdx","rsi","rdi","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15"		/* Clobbered registers */\
	);\
	}

	/* Twiddleless version of SSE2_RADIX8_DIF_TWIDDLE. Inputs enter in memory locations __r0 + [__i1,__i2,__i3,__i4,__i5,__i6,__i7];
	where r0 is a memory address and the i's are LITERAL [BYTE] OFFSETS (hence the "e" constraints and %c[] operand use below).
	Outputs go into memory locations __o0,__o1,__o2,__o3,__o4,__o5,__o6,__o7, assumed disjoint with inputs:\
	*/
	#define SSE2_RADIX8_DIF_0TWIDDLE(Xr0, Xi1,Xi2,Xi3,Xi4,Xi5,Xi6,Xi7, Xo0,Xo1,Xo2,Xo3,Xo4,Xo5,Xo6,Xo7, Xisrt2)\
	{\
	__asm__ volatile (\
	/* 1st of 2 radix-4 subtransforms, data in xmm0-7: **** 2nd of 2 radix-4 subtransforms, data in xmm8-15: */\
		"movq	%[__r0],%%rax	/* i0 = r00 */	\n\t			leaq	%c[__i1](%%rax),%%r10			\n\t"\
		"leaq	%c[__i2](%%rax),%%rbx			\n\t			leaq	%c[__i3](%%rax),%%r11			\n\t"\
		"leaq	%c[__i4](%%rax),%%rcx			\n\t			leaq	%c[__i5](%%rax),%%r12			\n\t"\
		"leaq	%c[__i6](%%rax),%%rdx			\n\t			leaq	%c[__i7](%%rax),%%r13			\n\t"\
		/* p0,4 combo: x+y into xmm0/1, x-y in xmm2/3: **** p1,5 combo: x+y into xmm8/1, x-y in xmm10/3: */\
		"							movq %[__isrt2],%%rsi \n\t	movaps	    (%%r12),%%xmm8 				\n\t"\
		"										\n\t			movaps	0x10(%%r12),%%xmm9 				\n\t"\
		"movaps	    (%%rcx),%%xmm0				\n\t			movaps	    (%%r10),%%xmm10				\n\t"\
		"movaps	0x10(%%rcx),%%xmm1				\n\t			movaps	0x10(%%r10),%%xmm11				\n\t"\
		"movaps	    (%%rax),%%xmm2				\n\t			subpd	%%xmm8 ,%%xmm10					\n\t"\
		"movaps	0x10(%%rax),%%xmm3				\n\t			subpd	%%xmm9 ,%%xmm11					\n\t"\
		"subpd	%%xmm0,%%xmm2					\n\t			addpd	%%xmm8 ,%%xmm8 					\n\t"\
		"subpd	%%xmm1,%%xmm3					\n\t			addpd	%%xmm9 ,%%xmm9 					\n\t"\
		"addpd	%%xmm0,%%xmm0					\n\t			addpd	%%xmm10,%%xmm8 					\n\t"\
		"addpd	%%xmm1,%%xmm1					\n\t			addpd	%%xmm11,%%xmm9 					\n\t"\
		"addpd	%%xmm2,%%xmm0					\n\t"	/* p3,7 combo: x+y into xmm14/7, x-y in xmm12/5: */\
		"addpd	%%xmm3,%%xmm1					\n\t			movaps	    (%%r11),%%xmm12				\n\t"\
		"										\n\t			movaps	0x10(%%r11),%%xmm13				\n\t"\
		/* p2,6 combo: x+y into xmm4/5, x-y in xmm6/7: */"\n\t	movaps	    (%%r13),%%xmm14				\n\t"\
		"										\n\t			movaps	0x10(%%r13),%%xmm15				\n\t"\
		"movaps	    (%%rdx),%%xmm4				\n\t			subpd	%%xmm14,%%xmm12					\n\t"\
		"movaps	0x10(%%rdx),%%xmm5				\n\t			subpd	%%xmm15,%%xmm13					\n\t"\
		"movaps	    (%%rbx),%%xmm6				\n\t			addpd	%%xmm14,%%xmm14					\n\t"\
		"movaps	0x10(%%rbx),%%xmm7				\n\t			addpd	%%xmm15,%%xmm15					\n\t"\
		"subpd	%%xmm4,%%xmm6					\n\t			addpd	%%xmm12,%%xmm14					\n\t"\
		"subpd	%%xmm5,%%xmm7					\n\t			addpd	%%xmm13,%%xmm15					\n\t"\
		"addpd	%%xmm4,%%xmm4					\n\t"	/* Finish radix-4 butterfly, tmp-store 1st of 4 outputs to free up 2 registers: */\
		"addpd	%%xmm5,%%xmm5					\n\t			subpd	%%xmm14,%%xmm8 					\n\t"\
		"addpd	%%xmm6,%%xmm4					\n\t			subpd	%%xmm15,%%xmm9 					\n\t"\
		"addpd	%%xmm7,%%xmm5					\n\t			subpd	%%xmm13,%%xmm10					\n\t"\
		"										\n\t			subpd	%%xmm12,%%xmm11					\n\t"\
		"subpd	%%xmm4,%%xmm0					\n\t			addpd	%%xmm14,%%xmm14					\n\t"\
		"subpd	%%xmm7,%%xmm2					\n\t			addpd	%%xmm13,%%xmm13					\n\t"\
		"subpd	%%xmm5,%%xmm1					\n\t			addpd	%%xmm15,%%xmm15					\n\t"\
		"subpd	%%xmm6,%%xmm3					\n\t			addpd	%%xmm12,%%xmm12					\n\t"\
		"														addpd	%%xmm8 ,%%xmm14					\n\t"\
		"														addpd	%%xmm10,%%xmm13					\n\t"\
		"														addpd	%%xmm9 ,%%xmm15					\n\t"\
		"														addpd	%%xmm11,%%xmm12					\n\t"\
		"addpd	%%xmm4,%%xmm4					\n\t			movaps	%%xmm14,    (%%r10)				\n\t"/* spill to the __i1 slot to free 2 regs */\
		"addpd	%%xmm7,%%xmm7					\n\t			movaps	%%xmm15,0x10(%%r10)				\n\t"\
		"addpd	%%xmm5,%%xmm5					\n\t			movaps	%%xmm10,%%xmm14					\n\t"\
		"addpd	%%xmm6,%%xmm6					\n\t			movaps	%%xmm13,%%xmm15					\n\t"\
		"addpd	%%xmm0,%%xmm4					\n\t			subpd	%%xmm12,%%xmm10					\n\t"\
		"addpd	%%xmm2,%%xmm7					\n\t			subpd	%%xmm11,%%xmm13					\n\t"\
		"addpd	%%xmm1,%%xmm5					\n\t			addpd	%%xmm14,%%xmm12					\n\t"\
		"addpd	%%xmm3,%%xmm6					\n\t			addpd	%%xmm15,%%xmm11					\n\t"\
		"														movaps	(%%rsi),%%xmm14		\n\t"/* isrt2 */\
		"														mulpd	%%xmm14,%%xmm10					\n\t"\
		"														mulpd	%%xmm14,%%xmm13					\n\t"\
		"														mulpd	%%xmm14,%%xmm12					\n\t"\
		"														mulpd	%%xmm14,%%xmm11					\n\t"\
		"movaps	    (%%r10),%%xmm14	\n\t"/* restore spilled */\
		"movaps	0x10(%%r10),%%xmm15	\n\t"/* restore spilled */\
		"\n\t"\
	/* Inline of SSE2_RADIX8_DIF_COMBINE_RAD4_SUBS_A(r0): Combine radix-4 subtransforms and write outputs: */\
		/***** t0,1,2,3,4,5,6,7 in xmm[ 4, 5| 2, 6| 0, 1| 7, 3] *****/\
		/***** t8,9,a,b,c,d,e,f in xmm[14,15|10,12| 8, 9|13,11] *****/\
		"\n\t"\
		"\n\t"\
		"movq	%[__o4],%%rax					\n\t			subpd   %%xmm10,%%xmm2			\n\t"\
		"movq	%[__o5],%%rbx					\n\t			subpd   %%xmm12,%%xmm6			\n\t"\
		"movq	%[__o6],%%rcx					\n\t			addpd   %%xmm10,%%xmm10			\n\t"\
		"movq	%[__o7],%%rdx					\n\t			addpd   %%xmm12,%%xmm12			\n\t"\
		"subpd   %%xmm11,%%xmm7					\n\t			addpd   %%xmm2,%%xmm10			\n\t"\
		"subpd   %%xmm13,%%xmm3					\n\t			addpd   %%xmm6,%%xmm12			\n\t"\
		"addpd   %%xmm11,%%xmm11				\n\t			movaps	%%xmm2 ,    (%%rbx)		\n\t"/* o5r */\
		"addpd   %%xmm13,%%xmm13				\n\t			movaps	%%xmm6 ,0x10(%%rbx)		\n\t"/* o5i */\
		"addpd   %%xmm7,%%xmm11					\n\t			movaps	%%xmm10,    (%%rax)		\n\t"/* o4r */\
		"addpd   %%xmm3,%%xmm13					\n\t			movaps	%%xmm12,0x10(%%rax)		\n\t"/* o4i */\
		"movaps	%%xmm7 ,    (%%rcx)		\n\t"/* o6r */\
		"movaps	%%xmm3 ,0x10(%%rdx)		\n\t"/* o7i */\
		"movaps	%%xmm11,    (%%rdx)		\n\t"/* o7r */\
		"movaps	%%xmm13,0x10(%%rcx)		\n\t"/* o6i */\
		"movq	%[__o0],%%rax					\n\t"\
		"movq	%[__o1],%%rbx					\n\t"\
		"movq	%[__o2],%%rcx					\n\t"\
		"movq	%[__o3],%%rdx					\n\t"\
		"subpd	%%xmm14,%%xmm4 					\n\t"\
		"subpd	%%xmm15,%%xmm5 					\n\t"\
		"subpd	%%xmm9 ,%%xmm0 					\n\t"\
		"subpd	%%xmm8 ,%%xmm1 					\n\t"\
		"addpd	%%xmm14,%%xmm14					\n\t			movaps	%%xmm4 ,    (%%rbx)		\n\t"/* o1r */\
		"addpd	%%xmm15,%%xmm15					\n\t			movaps	%%xmm5 ,0x10(%%rbx)		\n\t"/* o1i */\
		"addpd	%%xmm9 ,%%xmm9 					\n\t			movaps	%%xmm0 ,    (%%rcx)		\n\t"/* o2r */\
		"addpd	%%xmm8 ,%%xmm8 					\n\t			movaps	%%xmm1 ,0x10(%%rdx)		\n\t"/* o3i */\
		"addpd	%%xmm4 ,%%xmm14					\n\t"\
		"addpd	%%xmm5 ,%%xmm15					\n\t"\
		"addpd	%%xmm0 ,%%xmm9 					\n\t"\
		"addpd	%%xmm1 ,%%xmm8 					\n\t"\
		"movaps	%%xmm14,    (%%rax)		\n\t"/* o0r */\
		"movaps	%%xmm15,0x10(%%rax)		\n\t"/* o0i */\
		"movaps	%%xmm9 ,    (%%rdx)		\n\t"/* o3r */\
		"movaps	%%xmm8 ,0x10(%%rcx)		\n\t"/* o2i */\
		:					/* outputs: none */\
		: [__r0] "m" (Xr0)	/* All inputs from memory addresses here */\
		 ,[__i1] "e" (Xi1)\
		 ,[__i2] "e" (Xi2)\
		 ,[__i3] "e" (Xi3)\
		 ,[__i4] "e" (Xi4)\
		 ,[__i5] "e" (Xi5)\
		 ,[__i6] "e" (Xi6)\
		 ,[__i7] "e" (Xi7)\
		 ,[__o0] "m" (Xo0)\
		 ,[__o1] "m" (Xo1)\
		 ,[__o2] "m" (Xo2)\
		 ,[__o3] "m" (Xo3)\
		 ,[__o4] "m" (Xo4)\
		 ,[__o5] "m" (Xo5)\
		 ,[__o6] "m" (Xo6)\
		 ,[__o7] "m" (Xo7)\
		 ,[__isrt2] "m" (Xisrt2)\
		: "cc","memory","rax","rbx","rcx","rdx","rsi","r10","r11","r12","r13","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15"		/* Clobbered registers */\
	);\
	}

	// Need a 2nd version of above which takes the i-strides as intvars rather than literal bytes:
	#define SSE2_RADIX8_DIF_0TWIDDLE_B(Xr0, Xi1,Xi2,Xi3,Xi4,Xi5,Xi6,Xi7, Xo0,Xo1,Xo2,Xo3,Xo4,Xo5,Xo6,Xo7, Xisrt2)\
	{\
	__asm__ volatile (\
		"/* 1st of 2 radix-4 subtransforms, data in xmm0-7: */\n\t	/* 2nd of 2 radix-4 subtransforms, data in xmm8-15: */\n\t"\
		"movq	%[__r0],%%rax	/* i0 = r00 */	\n\t			movslq	%[__i1],%%r10		/* i1 */	\n\t"\
		"movslq	%[__i2],%%rbx	/* i2 */		\n\t			movslq	%[__i3],%%r11		/* i3 */	\n\t"\
		"movslq	%[__i4],%%rcx	/* i4 */		\n\t			movslq	%[__i5],%%r12		/* i5 */	\n\t"\
		"movslq	%[__i6],%%rdx	/* i6 */		\n\t			movslq	%[__i7],%%r13		/* i7 */	\n\t"\
		"addq	%%rax,%%rbx						\n\t			addq	%%rax,%%r10						\n\t"\
		"addq	%%rax,%%rcx						\n\t			addq	%%rax,%%r11						\n\t"\
		"addq	%%rax,%%rdx						\n\t			addq	%%rax,%%r12						\n\t"\
		"movq	%[__isrt2],%%rsi				\n\t			addq	%%rax,%%r13						\n\t"\
		"										\n\t			/* p1,5 combo: x+y into xmm8 /1, x-y in xmm10/3: */	\n\t"\
		"/* p0,4 combo: x+y into xmm0/1, x-y in xmm2/3: */\n\t	movaps	    (%%r12),%%xmm8 				\n\t"\
		"										\n\t			movaps	0x10(%%r12),%%xmm9 				\n\t"\
		"movaps	    (%%rcx),%%xmm0				\n\t			movaps	    (%%r10),%%xmm10				\n\t"\
		"movaps	0x10(%%rcx),%%xmm1				\n\t			movaps	0x10(%%r10),%%xmm11				\n\t"\
		"movaps	    (%%rax),%%xmm2				\n\t			subpd	%%xmm8 ,%%xmm10					\n\t"\
		"movaps	0x10(%%rax),%%xmm3				\n\t			subpd	%%xmm9 ,%%xmm11					\n\t"\
		"subpd	%%xmm0,%%xmm2					\n\t			addpd	%%xmm8 ,%%xmm8 					\n\t"\
		"subpd	%%xmm1,%%xmm3					\n\t			addpd	%%xmm9 ,%%xmm9 					\n\t"\
		"addpd	%%xmm0,%%xmm0					\n\t			addpd	%%xmm10,%%xmm8 					\n\t"\
		"addpd	%%xmm1,%%xmm1					\n\t			addpd	%%xmm11,%%xmm9 					\n\t"\
		"addpd	%%xmm2,%%xmm0					\n\t			/* p3,7 combo: x+y into xmm14/7, x-y in xmm12/5: */	\n\t"\
		"addpd	%%xmm3,%%xmm1					\n\t			movaps	    (%%r11),%%xmm12				\n\t"\
		"										\n\t			movaps	0x10(%%r11),%%xmm13				\n\t"\
		"/* p2,6 combo: x+y into xmm4/5, x-y in xmm6/7: */\n\t	movaps	    (%%r13),%%xmm14				\n\t"\
		"										\n\t			movaps	0x10(%%r13),%%xmm15				\n\t"\
		"movaps	    (%%rdx),%%xmm4				\n\t			subpd	%%xmm14,%%xmm12					\n\t"\
		"movaps	0x10(%%rdx),%%xmm5				\n\t			subpd	%%xmm15,%%xmm13					\n\t"\
		"movaps	    (%%rbx),%%xmm6				\n\t			addpd	%%xmm14,%%xmm14					\n\t"\
		"movaps	0x10(%%rbx),%%xmm7				\n\t			addpd	%%xmm15,%%xmm15					\n\t"\
		"subpd	%%xmm4,%%xmm6					\n\t			addpd	%%xmm12,%%xmm14					\n\t"\
		"subpd	%%xmm5,%%xmm7					\n\t			addpd	%%xmm13,%%xmm15					\n\t"\
		"addpd	%%xmm4,%%xmm4					\n\t			/* Finish radix-4 butterfly, tmp-store 1st of 4 outputs to free up 2 registers: */\n\t"\
		"addpd	%%xmm5,%%xmm5					\n\t			subpd	%%xmm14,%%xmm8 					\n\t"\
		"addpd	%%xmm6,%%xmm4					\n\t			subpd	%%xmm15,%%xmm9 					\n\t"\
		"addpd	%%xmm7,%%xmm5					\n\t			subpd	%%xmm13,%%xmm10					\n\t"\
		"										\n\t			subpd	%%xmm12,%%xmm11					\n\t"\
		"subpd	%%xmm4,%%xmm0					\n\t			addpd	%%xmm14,%%xmm14					\n\t"\
		"subpd	%%xmm7,%%xmm2					\n\t			addpd	%%xmm13,%%xmm13					\n\t"\
		"subpd	%%xmm5,%%xmm1					\n\t			addpd	%%xmm15,%%xmm15					\n\t"\
		"subpd	%%xmm6,%%xmm3					\n\t			addpd	%%xmm12,%%xmm12					\n\t"\
		"														addpd	%%xmm8 ,%%xmm14					\n\t"\
		"														addpd	%%xmm10,%%xmm13					\n\t"\
		"														addpd	%%xmm9 ,%%xmm15					\n\t"\
		"														addpd	%%xmm11,%%xmm12					\n\t"\
		"addpd	%%xmm4,%%xmm4					\n\t			movaps	%%xmm14,    (%%r10)				\n\t"\
		"addpd	%%xmm7,%%xmm7					\n\t			movaps	%%xmm15,0x10(%%r10)				\n\t"\
		"addpd	%%xmm5,%%xmm5					\n\t			movaps	%%xmm10,%%xmm14					\n\t"\
		"addpd	%%xmm6,%%xmm6					\n\t			movaps	%%xmm13,%%xmm15					\n\t"\
		"addpd	%%xmm0,%%xmm4					\n\t			subpd	%%xmm12,%%xmm10					\n\t"\
		"addpd	%%xmm2,%%xmm7					\n\t			subpd	%%xmm11,%%xmm13					\n\t"\
		"addpd	%%xmm1,%%xmm5					\n\t			addpd	%%xmm14,%%xmm12					\n\t"\
		"addpd	%%xmm3,%%xmm6					\n\t			addpd	%%xmm15,%%xmm11					\n\t"\
		"														movaps	(%%rsi),%%xmm14	/* isrt2 */		\n\t"\
		"														mulpd	%%xmm14,%%xmm10					\n\t"\
		"														mulpd	%%xmm14,%%xmm13					\n\t"\
		"														mulpd	%%xmm14,%%xmm12					\n\t"\
		"										\n\t			mulpd	%%xmm14,%%xmm11					\n\t"\
		"movaps	    (%%r10),%%xmm14	/* restore spilled */\n\t"\
		"movaps	0x10(%%r10),%%xmm15	/* restore spilled */\n\t"\
		"										\n\t"\
		"/* Inline of SSE2_RADIX8_DIF_COMBINE_RAD4_SUBS_A(r0): Combine radix-4 subtransforms and write outputs: */\n\t"\
		"/***** t0,1,2,3,4,5,6,7 in xmm[ 4, 5| 2, 6| 0, 1| 7, 3] *****/\n\t"\
		"/***** t8,9,a,b,c,d,e,f in xmm[14,15|10,12| 8, 9|13,11] */\n\t"\
		"movq	%[__o4],%%rax					\n\t			subpd   %%xmm10,%%xmm2					\n\t"\
		"movq	%[__o5],%%rbx					\n\t			subpd   %%xmm12,%%xmm6					\n\t"\
		"movq	%[__o6],%%rcx					\n\t			addpd   %%xmm10,%%xmm10					\n\t"\
		"movq	%[__o7],%%rdx					\n\t			addpd   %%xmm12,%%xmm12					\n\t"\
		"										\n\t			addpd   %%xmm2,%%xmm10					\n\t"\
		"subpd   %%xmm11,%%xmm7					\n\t			addpd   %%xmm6,%%xmm12					\n\t"\
		"subpd   %%xmm13,%%xmm3					\n\t													\n\t"\
		"addpd   %%xmm11,%%xmm11				\n\t			movaps	%%xmm2 ,    (%%rbx)	/* o5r */	\n\t"\
		"addpd   %%xmm13,%%xmm13				\n\t			movaps	%%xmm6 ,0x10(%%rbx)	/* o5i */	\n\t"\
		"addpd   %%xmm7,%%xmm11					\n\t			movaps	%%xmm10,    (%%rax)	/* o4r */	\n\t"\
		"addpd   %%xmm3,%%xmm13					\n\t			movaps	%%xmm12,0x10(%%rax)	/* o4i */	\n\t"\
		"										\n\t"\
		"movaps	%%xmm7 ,    (%%rcx)	/* o6r */	\n\t"\
		"movaps	%%xmm3 ,0x10(%%rdx)	/* o7i */	\n\t"\
		"movaps	%%xmm11,    (%%rdx)	/* o7r */	\n\t"\
		"movaps	%%xmm13,0x10(%%rcx)	/* o6i */	\n\t"\
		"										\n\t"\
		"movq	%[__o0],%%rax					\n\t"\
		"movq	%[__o1],%%rbx					\n\t"\
		"movq	%[__o2],%%rcx					\n\t"\
		"movq	%[__o3],%%rdx					\n\t"\
		"										\n\t"\
		"subpd	%%xmm14,%%xmm4 					\n\t"\
		"subpd	%%xmm15,%%xmm5 					\n\t"\
		"subpd	%%xmm9 ,%%xmm0 					\n\t"\
		"subpd	%%xmm8 ,%%xmm1 					\n\t"\
		"addpd	%%xmm14,%%xmm14					\n\t			movaps	%%xmm4 ,    (%%rbx)	/* o1r */	\n\t"\
		"addpd	%%xmm15,%%xmm15					\n\t			movaps	%%xmm5 ,0x10(%%rbx)	/* o1i */	\n\t"\
		"addpd	%%xmm9 ,%%xmm9 					\n\t			movaps	%%xmm0 ,    (%%rcx)	/* o2r */	\n\t"\
		"addpd	%%xmm8 ,%%xmm8 					\n\t			movaps	%%xmm1 ,0x10(%%rdx)	/* o3i */	\n\t"\
		"addpd	%%xmm4 ,%%xmm14					\n\t"\
		"addpd	%%xmm5 ,%%xmm15					\n\t"\
		"addpd	%%xmm0 ,%%xmm9 					\n\t"\
		"addpd	%%xmm1 ,%%xmm8 					\n\t"\
		"										\n\t"\
		"movaps	%%xmm14,    (%%rax)	/* o0r */	\n\t"\
		"movaps	%%xmm15,0x10(%%rax)	/* o0r */	\n\t"\
		"movaps	%%xmm9 ,    (%%rdx)	/* o3r */	\n\t"\
		"movaps	%%xmm8 ,0x10(%%rcx)	/* o2i */	\n\t"\
		"										\n\t"\
		:					/* outputs: none */\
		: [__r0] "m" (Xr0)	/* All inputs from memory addresses here */\
		 ,[__i1] "m" (Xi1)\
		 ,[__i2] "m" (Xi2)\
		 ,[__i3] "m" (Xi3)\
		 ,[__i4] "m" (Xi4)\
		 ,[__i5] "m" (Xi5)\
		 ,[__i6] "m" (Xi6)\
		 ,[__i7] "m" (Xi7)\
		 ,[__o0] "m" (Xo0)\
		 ,[__o1] "m" (Xo1)\
		 ,[__o2] "m" (Xo2)\
		 ,[__o3] "m" (Xo3)\
		 ,[__o4] "m" (Xo4)\
		 ,[__o5] "m" (Xo5)\
		 ,[__o6] "m" (Xo6)\
		 ,[__o7] "m" (Xo7)\
		 ,[__isrt2] "m" (Xisrt2)\
		: "cc","memory","rax","rbx","rcx","rdx","rsi","r10","r11","r12","r13","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15"		/* Clobbered registers */\
	);\
	}

	/* Twiddleless version of SSE2_RADIX8_DIT_TWIDDLE. Inputs enter in memory locations __i0,__i1,__i2,__i3,__i4,__i5,__i6,__i7.
	Outputs go into 16 contiguous 32-byte memory locations starting at __out and assumed disjoint with inputs.
	This macro built on the same code template as SSE2_RADIX8_DIF_TWIDDLE0, but with the I/O-location indices mutually bit reversed:
	01234567 <--> 04261537, which can be effected via the pairwise swaps 1 <--> 4 and 3 <--> 6.
	*/
	#define	SSE2_RADIX8_DIT_0TWIDDLE(Xi0,Xi1,Xi2,Xi3,Xi4,Xi5,Xi6,Xi7, Xout, Xisrt2)\
	{\
	__asm__ volatile (\
		"/* 1st of 2 radix-4 subtransforms, data in xmm0-7: */\n\t	/* 2nd of 2 radix-4 subtransforms, data in xmm8-15: */\n\t"\
		"movq	%[__i0],%%rax					\n\t			movq	%[__i4],%%r10					\n\t"\
		"movq	%[__i1],%%rbx					\n\t			movq	%[__i5],%%r11					\n\t"\
		"movq	%[__i2],%%rcx					\n\t			movq	%[__i6],%%r12					\n\t"\
		"movq	%[__i3],%%rdx					\n\t			movq	%[__i7],%%r13					\n\t"\
		"										\n\t			/* p1,5 combo: x+y into xmm8 /1, x-y in xmm10/3: */	\n\t"\
		"/* p0,4 combo: x+y into xmm0/1, x-y in xmm2/3: */\n\t	movaps	    (%%r11),%%xmm8 				\n\t"\
		"										\n\t			movaps	0x10(%%r11),%%xmm9 				\n\t"\
		"movaps	    (%%rbx),%%xmm0				\n\t			movaps	    (%%r10),%%xmm10				\n\t"\
		"movaps	0x10(%%rbx),%%xmm1				\n\t			movaps	0x10(%%r10),%%xmm11				\n\t"\
		"movaps	    (%%rax),%%xmm2				\n\t			subpd	%%xmm8 ,%%xmm10					\n\t"\
		"movaps	0x10(%%rax),%%xmm3				\n\t			subpd	%%xmm9 ,%%xmm11					\n\t"\
		"subpd	%%xmm0,%%xmm2					\n\t			addpd	%%xmm8 ,%%xmm8 					\n\t"\
		"subpd	%%xmm1,%%xmm3					\n\t			addpd	%%xmm9 ,%%xmm9 					\n\t"\
		"addpd	%%xmm0,%%xmm0					\n\t			addpd	%%xmm10,%%xmm8 					\n\t"\
		"addpd	%%xmm1,%%xmm1					\n\t			addpd	%%xmm11,%%xmm9 					\n\t"\
		"addpd	%%xmm2,%%xmm0					\n\t			/* p3,7 combo: x+y into xmm14/7, x-y in xmm12/5: */	\n\t"\
		"addpd	%%xmm3,%%xmm1					\n\t			movaps	    (%%r12),%%xmm12				\n\t"\
		"										\n\t			movaps	0x10(%%r12),%%xmm13				\n\t"\
		"/* p2,6 combo: x+y into xmm4/5, x-y in xmm6/7: */\n\t	movaps	    (%%r13),%%xmm14				\n\t"\
		"										\n\t			movaps	0x10(%%r13),%%xmm15				\n\t"\
		"movaps	    (%%rdx),%%xmm4				\n\t			subpd	%%xmm14,%%xmm12					\n\t"\
		"movaps	0x10(%%rdx),%%xmm5				\n\t			subpd	%%xmm15,%%xmm13					\n\t"\
		"movaps	    (%%rcx),%%xmm6				\n\t			addpd	%%xmm14,%%xmm14					\n\t"\
		"movaps	0x10(%%rcx),%%xmm7				\n\t			addpd	%%xmm15,%%xmm15					\n\t"\
		"subpd	%%xmm4,%%xmm6					\n\t			addpd	%%xmm12,%%xmm14					\n\t"\
		"subpd	%%xmm5,%%xmm7					\n\t			addpd	%%xmm13,%%xmm15					\n\t"\
		"addpd	%%xmm4,%%xmm4					\n\t			/* Finish radix-4 butterfly, tmp-store 1st of 4 outputs to free up 2 registers: */\n\t"\
		"addpd	%%xmm5,%%xmm5					\n\t			subpd	%%xmm14,%%xmm8 					\n\t"\
		"addpd	%%xmm6,%%xmm4					\n\t			subpd	%%xmm15,%%xmm9 					\n\t"\
		"addpd	%%xmm7,%%xmm5					\n\t			subpd	%%xmm13,%%xmm10					\n\t"\
		"										\n\t			subpd	%%xmm12,%%xmm11					\n\t"\
		"subpd	%%xmm4,%%xmm0					\n\t			addpd	%%xmm14,%%xmm14					\n\t"\
		"subpd	%%xmm7,%%xmm2					\n\t			addpd	%%xmm13,%%xmm13					\n\t"\
		"subpd	%%xmm5,%%xmm1					\n\t			addpd	%%xmm15,%%xmm15					\n\t"\
		"subpd	%%xmm6,%%xmm3					\n\t			addpd	%%xmm12,%%xmm12					\n\t"\
		"														addpd	%%xmm8 ,%%xmm14					\n\t"\
		"														addpd	%%xmm10,%%xmm13					\n\t"\
		"														addpd	%%xmm9 ,%%xmm15					\n\t"\
		"														addpd	%%xmm11,%%xmm12					\n\t"\
		"														movq	%[__isrt2],%%rsi	/* isrt2 */	\n\t"\
		"addpd	%%xmm4,%%xmm4					\n\t			movaps	%%xmm14,    (%%rax)	/* spill */	\n\t"\
		"addpd	%%xmm7,%%xmm7					\n\t			movaps	%%xmm15,0x10(%%rax)	/* spill */	\n\t"\
		"addpd	%%xmm5,%%xmm5					\n\t			movaps	%%xmm10,%%xmm14					\n\t"\
		"addpd	%%xmm6,%%xmm6					\n\t			movaps	%%xmm13,%%xmm15					\n\t"\
		"addpd	%%xmm0,%%xmm4					\n\t			subpd	%%xmm12,%%xmm10					\n\t"\
		"addpd	%%xmm2,%%xmm7					\n\t			subpd	%%xmm11,%%xmm13					\n\t"\
		"addpd	%%xmm1,%%xmm5					\n\t			addpd	%%xmm14,%%xmm12					\n\t"\
		"addpd	%%xmm3,%%xmm6					\n\t			addpd	%%xmm15,%%xmm11					\n\t"\
		"														movaps	(%%rsi),%%xmm14		/* isrt2 */	\n\t"\
		"														mulpd	%%xmm14,%%xmm10					\n\t"\
		"														mulpd	%%xmm14,%%xmm13					\n\t"\
		"														mulpd	%%xmm14,%%xmm12					\n\t"\
		"														mulpd	%%xmm14,%%xmm11					\n\t"\
		"/* Combine radix-4 subtransforms and write outputs: */\n\t"\
		"\n\t"\
		"movaps	    (%%rax),%%xmm14	/* restore spilled */\n\t	subpd   %%xmm10,%%xmm2					\n\t"\
		"movaps	0x10(%%rax),%%xmm15	/* restore spilled */\n\t	subpd   %%xmm12,%%xmm6					\n\t"\
		"														addpd   %%xmm10,%%xmm10					\n\t"\
		"movq	%[__out],%%rax					\n\t			addpd   %%xmm12,%%xmm12					\n\t"\
		"										\n\t			addpd   %%xmm2,%%xmm10					\n\t"\
		"subpd   %%xmm11,%%xmm7					\n\t			addpd   %%xmm6,%%xmm12					\n\t"\
		"subpd   %%xmm13,%%xmm3					\n\t													\n\t"\
		"addpd   %%xmm11,%%xmm11				\n\t			movaps	%%xmm2 ,0xa0(%%rax)	/* o5r */	\n\t"\
		"addpd   %%xmm13,%%xmm13				\n\t			movaps	%%xmm6 ,0xb0(%%rax)	/* o5i */	\n\t"\
		"addpd   %%xmm7,%%xmm11					\n\t			movaps	%%xmm10,0x20(%%rax)	/* o1r */	\n\t"\
		"addpd   %%xmm3,%%xmm13					\n\t			movaps	%%xmm12,0x30(%%rax)	/* o1i */	\n\t"\
		"										\n\t"\
		"movaps	%%xmm7 ,0x60(%%rax)	/* o3r */	\n\t"\
		"movaps	%%xmm3 ,0xf0(%%rax)	/* o7i */	\n\t"\
		"movaps	%%xmm11,0xe0(%%rax)	/* o7r */	\n\t"\
		"movaps	%%xmm13,0x70(%%rax)	/* o3i */	\n\t"\
		"										\n\t"\
		"subpd	%%xmm14,%%xmm4 					\n\t"\
		"subpd	%%xmm15,%%xmm5 					\n\t"\
		"subpd	%%xmm9 ,%%xmm0 					\n\t"\
		"subpd	%%xmm8 ,%%xmm1 					\n\t"\
		"addpd	%%xmm14,%%xmm14					\n\t			movaps	%%xmm4 ,0x80(%%rax)	/* o4r */	\n\t"\
		"addpd	%%xmm15,%%xmm15					\n\t			movaps	%%xmm5 ,0x90(%%rax)	/* o4i */	\n\t"\
		"addpd	%%xmm9 ,%%xmm9 					\n\t			movaps	%%xmm0 ,0x40(%%rax)	/* o2r */	\n\t"\
		"addpd	%%xmm8 ,%%xmm8 					\n\t			movaps	%%xmm1 ,0xd0(%%rax)	/* o6i */	\n\t"\
		"addpd	%%xmm4 ,%%xmm14					\n\t"\
		"addpd	%%xmm5 ,%%xmm15					\n\t"\
		"addpd	%%xmm0 ,%%xmm9 					\n\t"\
		"addpd	%%xmm1 ,%%xmm8 					\n\t"\
		"										\n\t"\
		"movaps	%%xmm14,    (%%rax)	/* o0r */	\n\t"\
		"movaps	%%xmm15,0x10(%%rax)	/* o0r */	\n\t"\
		"movaps	%%xmm9 ,0xc0(%%rax)	/* o6r */	\n\t"\
		"movaps	%%xmm8 ,0x50(%%rax)	/* o2i */	\n\t"\
		"										\n\t"\
		:					/* outputs: none */\
		: [__i0] "m" (Xi0)	/* All iputs from memory addresses here */\
		 ,[__i1] "m" (Xi1)\
		 ,[__i2] "m" (Xi2)\
		 ,[__i3] "m" (Xi3)\
		 ,[__i4] "m" (Xi4)\
		 ,[__i5] "m" (Xi5)\
		 ,[__i6] "m" (Xi6)\
		 ,[__i7] "m" (Xi7)\
		 ,[__out] "m" (Xout)\
		 ,[__isrt2] "m" (Xisrt2)\
		: "cc","memory","rax","rbx","rcx","rdx","rsi","r10","r11","r12","r13","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15"		/* Clobbered registers */\
	);\
	}

	// Same as SSE2_RADIX8_DIT_0TWIDDLE but with user-specifiable [i.e. not nec. contiguous] output addresses:
	// the 8 output complex pairs go to the 8 independently addressed locations __o0-__o7 rather than
	// to fixed byte offsets from a single base pointer. The butterfly/register-scheduling code is
	// otherwise identical to SSE2_RADIX8_DIT_0TWIDDLE; o-addresses are loaded into r[a-d]x on the fly,
	// interleaved with the arithmetic, as each pair of outputs becomes ready.
	#define	SSE2_RADIX8_DIT_0TWIDDLE_OOP(Xi0,Xi1,Xi2,Xi3,Xi4,Xi5,Xi6,Xi7, Xo0,Xo1,Xo2,Xo3,Xo4,Xo5,Xo6,Xo7, Xisrt2)\
	{\
	__asm__ volatile (\
		"/* 1st of 2 radix-4 subtransforms, data in xmm0-7: */\n\t	/* 2nd of 2 radix-4 subtransforms, data in xmm8-15: */\n\t"\
		"movq	%[__i0],%%rax					\n\t			movq	%[__i4],%%r10					\n\t"\
		"movq	%[__i1],%%rbx					\n\t			movq	%[__i5],%%r11					\n\t"\
		"movq	%[__i2],%%rcx					\n\t			movq	%[__i6],%%r12					\n\t"\
		"movq	%[__i3],%%rdx					\n\t			movq	%[__i7],%%r13					\n\t"\
		"										\n\t			/* p1,5 combo: x+y into xmm8 /1, x-y in xmm10/3: */	\n\t"\
		"/* p0,4 combo: x+y into xmm0/1, x-y in xmm2/3: */\n\t	movaps	    (%%r11),%%xmm8 				\n\t"\
		"										\n\t			movaps	0x10(%%r11),%%xmm9 				\n\t"\
		"movaps	    (%%rbx),%%xmm0				\n\t			movaps	    (%%r10),%%xmm10				\n\t"\
		"movaps	0x10(%%rbx),%%xmm1				\n\t			movaps	0x10(%%r10),%%xmm11				\n\t"\
		"movaps	    (%%rax),%%xmm2				\n\t			subpd	%%xmm8 ,%%xmm10					\n\t"\
		"movaps	0x10(%%rax),%%xmm3				\n\t			subpd	%%xmm9 ,%%xmm11					\n\t"\
		"subpd	%%xmm0,%%xmm2					\n\t			addpd	%%xmm8 ,%%xmm8 					\n\t"\
		"subpd	%%xmm1,%%xmm3					\n\t			addpd	%%xmm9 ,%%xmm9 					\n\t"\
		"addpd	%%xmm0,%%xmm0					\n\t			addpd	%%xmm10,%%xmm8 					\n\t"\
		"addpd	%%xmm1,%%xmm1					\n\t			addpd	%%xmm11,%%xmm9 					\n\t"\
		"addpd	%%xmm2,%%xmm0					\n\t			/* p3,7 combo: x+y into xmm14/7, x-y in xmm12/5: */	\n\t"\
		"addpd	%%xmm3,%%xmm1					\n\t			movaps	    (%%r12),%%xmm12				\n\t"\
		"										\n\t			movaps	0x10(%%r12),%%xmm13				\n\t"\
		"/* p2,6 combo: x+y into xmm4/5, x-y in xmm6/7: */\n\t	movaps	    (%%r13),%%xmm14				\n\t"\
		"										\n\t			movaps	0x10(%%r13),%%xmm15				\n\t"\
		"movaps	    (%%rdx),%%xmm4				\n\t			subpd	%%xmm14,%%xmm12					\n\t"\
		"movaps	0x10(%%rdx),%%xmm5				\n\t			subpd	%%xmm15,%%xmm13					\n\t"\
		"movaps	    (%%rcx),%%xmm6				\n\t			addpd	%%xmm14,%%xmm14					\n\t"\
		"movaps	0x10(%%rcx),%%xmm7				\n\t			addpd	%%xmm15,%%xmm15					\n\t"\
		"subpd	%%xmm4,%%xmm6					\n\t			addpd	%%xmm12,%%xmm14					\n\t"\
		"subpd	%%xmm5,%%xmm7					\n\t			addpd	%%xmm13,%%xmm15					\n\t"\
		"addpd	%%xmm4,%%xmm4					\n\t			/* Finish radix-4 butterfly, tmp-store 1st of 4 outputs to free up 2 registers: */\n\t"\
		"addpd	%%xmm5,%%xmm5					\n\t			subpd	%%xmm14,%%xmm8 					\n\t"\
		"addpd	%%xmm6,%%xmm4					\n\t			subpd	%%xmm15,%%xmm9 					\n\t"\
		"addpd	%%xmm7,%%xmm5					\n\t			subpd	%%xmm13,%%xmm10					\n\t"\
		"										\n\t			subpd	%%xmm12,%%xmm11					\n\t"\
		"subpd	%%xmm4,%%xmm0					\n\t			addpd	%%xmm14,%%xmm14					\n\t"\
		"subpd	%%xmm7,%%xmm2					\n\t			addpd	%%xmm13,%%xmm13					\n\t"\
		"subpd	%%xmm5,%%xmm1					\n\t			addpd	%%xmm15,%%xmm15					\n\t"\
		"subpd	%%xmm6,%%xmm3					\n\t			addpd	%%xmm12,%%xmm12					\n\t"\
		"														addpd	%%xmm8 ,%%xmm14					\n\t"\
		"														addpd	%%xmm10,%%xmm13					\n\t"\
		"														addpd	%%xmm9 ,%%xmm15					\n\t"\
		"														addpd	%%xmm11,%%xmm12					\n\t"\
		"														movq	%[__isrt2],%%rsi	/* isrt2 */	\n\t"\
		"addpd	%%xmm4,%%xmm4					\n\t			movaps	%%xmm14,    (%%rax)	/* spill */	\n\t"\
		"addpd	%%xmm7,%%xmm7					\n\t			movaps	%%xmm15,0x10(%%rax)	/* spill */	\n\t"\
		"addpd	%%xmm5,%%xmm5					\n\t			movaps	%%xmm10,%%xmm14					\n\t"\
		"addpd	%%xmm6,%%xmm6					\n\t			movaps	%%xmm13,%%xmm15					\n\t"\
		"addpd	%%xmm0,%%xmm4					\n\t			subpd	%%xmm12,%%xmm10					\n\t"\
		"addpd	%%xmm2,%%xmm7					\n\t			subpd	%%xmm11,%%xmm13					\n\t"\
		"addpd	%%xmm1,%%xmm5					\n\t			addpd	%%xmm14,%%xmm12					\n\t"\
		"addpd	%%xmm3,%%xmm6					\n\t			addpd	%%xmm15,%%xmm11					\n\t"\
		"														movaps	(%%rsi),%%xmm14		/* isrt2 */	\n\t"\
		"														mulpd	%%xmm14,%%xmm10					\n\t"\
		"														mulpd	%%xmm14,%%xmm13					\n\t"\
		"														mulpd	%%xmm14,%%xmm12					\n\t"\
		"														mulpd	%%xmm14,%%xmm11					\n\t"\
		"/* Combine radix-4 subtransforms and write outputs: */\n\t"\
		"\n\t"\
		"movaps	    (%%rax),%%xmm14	/* restore spilled */\n\t	subpd   %%xmm10,%%xmm2					\n\t"\
		"movaps	0x10(%%rax),%%xmm15	/* restore spilled */\n\t	subpd   %%xmm12,%%xmm6					\n\t"\
		"movq	%[__o1],%%rax					\n\t			movq	%[__o5],%%rcx					\n\t"\
		"														addpd   %%xmm10,%%xmm10					\n\t"\
		"										\n\t			addpd   %%xmm12,%%xmm12					\n\t"\
		"										\n\t			addpd   %%xmm2,%%xmm10					\n\t"\
		"subpd   %%xmm11,%%xmm7					\n\t			addpd   %%xmm6,%%xmm12					\n\t"\
		"subpd   %%xmm13,%%xmm3					\n\t													\n\t"\
		"movq	%[__o3],%%rbx					\n\t			movq	%[__o7],%%rdx					\n\t"\
		"addpd   %%xmm11,%%xmm11				\n\t			movaps	%%xmm2 ,    (%%rcx)	/* o5r */	\n\t"\
		"addpd   %%xmm13,%%xmm13				\n\t			movaps	%%xmm6 ,0x10(%%rcx)	/* o5i */	\n\t"\
		"addpd   %%xmm7,%%xmm11					\n\t			movaps	%%xmm10,    (%%rax)	/* o1r */	\n\t"\
		"addpd   %%xmm3,%%xmm13					\n\t			movaps	%%xmm12,0x10(%%rax)	/* o1i */	\n\t"\
		"movq	%[__o0],%%rax					\n\t			movq	%[__o4],%%rcx					\n\t"\
		"										\n\t"\
		"movaps	%%xmm7 ,    (%%rbx)	/* o3r */	\n\t"\
		"movaps	%%xmm3 ,0x10(%%rdx)	/* o7i */	\n\t"\
		"movaps	%%xmm11,    (%%rdx)	/* o7r */	\n\t"\
		"movaps	%%xmm13,0x10(%%rbx)	/* o3i */	\n\t"\
		"										\n\t"\
		"movq	%[__o2],%%rbx					\n\t			movq	%[__o6],%%rdx					\n\t"\
		"subpd	%%xmm14,%%xmm4 					\n\t"\
		"subpd	%%xmm15,%%xmm5 					\n\t"\
		"subpd	%%xmm9 ,%%xmm0 					\n\t"\
		"subpd	%%xmm8 ,%%xmm1 					\n\t"\
		"addpd	%%xmm14,%%xmm14					\n\t			movaps	%%xmm4 ,    (%%rcx)	/* o4r */	\n\t"\
		"addpd	%%xmm15,%%xmm15					\n\t			movaps	%%xmm5 ,0x10(%%rcx)	/* o4i */	\n\t"\
		"addpd	%%xmm9 ,%%xmm9 					\n\t			movaps	%%xmm0 ,    (%%rbx)	/* o2r */	\n\t"\
		"addpd	%%xmm8 ,%%xmm8 					\n\t			movaps	%%xmm1 ,0x10(%%rdx)	/* o6i */	\n\t"\
		"addpd	%%xmm4 ,%%xmm14					\n\t"\
		"addpd	%%xmm5 ,%%xmm15					\n\t"\
		"addpd	%%xmm0 ,%%xmm9 					\n\t"\
		"addpd	%%xmm1 ,%%xmm8 					\n\t"\
		"										\n\t"\
		"movaps	%%xmm14,    (%%rax)	/* o0r */	\n\t"\
		"movaps	%%xmm15,0x10(%%rax)	/* o0i */	\n\t"\
		"movaps	%%xmm9 ,    (%%rdx)	/* o6r */	\n\t"\
		"movaps	%%xmm8 ,0x10(%%rbx)	/* o2i */	\n\t"\
		"										\n\t"\
		:					/* outputs: none */\
		: [__i0] "m" (Xi0)	/* All inputs from memory addresses here */\
		 ,[__i1] "m" (Xi1)\
		 ,[__i2] "m" (Xi2)\
		 ,[__i3] "m" (Xi3)\
		 ,[__i4] "m" (Xi4)\
		 ,[__i5] "m" (Xi5)\
		 ,[__i6] "m" (Xi6)\
		 ,[__i7] "m" (Xi7)\
		 ,[__o0] "m" (Xo0)\
		 ,[__o1] "m" (Xo1)\
		 ,[__o2] "m" (Xo2)\
		 ,[__o3] "m" (Xo3)\
		 ,[__o4] "m" (Xo4)\
		 ,[__o5] "m" (Xo5)\
		 ,[__o6] "m" (Xo6)\
		 ,[__o7] "m" (Xo7)\
		 ,[__isrt2] "m" (Xisrt2)\
		: "cc","memory","rax","rbx","rcx","rdx","rsi","r10","r11","r12","r13","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15"		/* Clobbered registers */\
	);\
	}

	// SSE2 analog of dft_macro.h::RADIX_08_DIF_TWIDDLE_OOP - Result of adding separate I/O addressing to
	// radix8_dif_dit_pass_gcc64.h::SSE2_RADIX8_DIF_TWIDDLE:
	/* Dec 2020: Need to cut #args for Apple M1/Clang builds on Arm64 - do similar on x86 to avoid
	multiple versions of the macro having different arglists.
	Inputs i0-7 are always ptrs to BRed addresses add[0,4,2,6,1,5,3,7], thus we need the following bfly-address-pairs:
		lcol:			rcol:
		i0,4 = add0,1	i1,5 = add4,5
		i2,6 = add2,3	i3,7 = add6,7
	i.e. within each column we have linear address-access: lcol = in0+[0,i1,i2,i3], rcol = [in0+i4]+[0,i1,i2,i3]
	Thus, in the #arg-reduced version of the macro, input-addresses are computed from in0 and [i1,i2,i4],
	where in0 is a memory base-address and the i's are LITERAL [BYTE] OFFSETS.

	Output addresses are nonlinear first due to our main-array index-padding scheme used to avoid cache conflicts,
	and secondly because e.g. radix-192,320 use permuted O-address octets, so in place of the previous
	passing of the 8 O-addresses as separate args we now send base-address for each O-address octet and pointer
	to a local length-8 array containing the double* index-offsets, and do the output-address
	pointer arithmetic inside the macro:
	*/
	#define SSE2_RADIX8_DIF_TWIDDLE_OOP(Xin0,Xi1, Xout0,Xoff, Xtwid_ptrs, Xtwo)\
	{\
	__asm__ volatile (\
	/* i1 is base byte-offset, no need to lshift it prior to add: */\
		"xorq	%%r8,%%r8	\n\t	leaq	%c[i1](%%r8),%%r8	\n\t"/* movq|movslq of literal %c[i1] both segfaulted, workaround via LEA */\
		"movq	%[in0],%%rax		\n\t			leaq	(%%rax,%%r8,4),%%r10		\n\t"/* [lcol,rcol] base-addresses = in0 + [0,4*istride] */\
		"movq	%[twid_ptrs],%%rsi	\n\t			leaq	(%%r10,%%r8  ),%%r11		\n\t"\
		/* The twid_ptrs[] array holds ptrs to 14 complex twiddles in BR order: (c,s)[4,2,6,1,5,3,7]: */\
		"				movq	0x30(%%rsi),%%r12	\n\t	movq	0x40(%%rsi),%%r14	\n\t"/* c1,c5 */\
		"				movq	0x38(%%rsi),%%r13	\n\t	movq	0x48(%%rsi),%%r15	\n\t"/* s1,s5 */\
		"											movaps	    (%%r10)	,%%xmm8 	\n\t"\
		"leaq	(%%rax,%%r8  ),%%rbx	\n\t		movaps	0x10(%%r10)	,%%xmm10	\n\t"\
		"movq	    (%%rsi),%%rcx	\n\t"/* c4 */"	movaps		%%xmm8 ,%%xmm9 	\n\t"\
		"movq	0x08(%%rsi),%%rsi	\n\t"/* s4 */"	movaps		%%xmm10,%%xmm11	\n\t"\
	/* [rsi] (and if needed rdi) points to sine components of each sincos pair, which is not really a pair here in terms of relative addressing: */\
		"movaps	    (%%rax)	,%%xmm0		\n\t		mulpd	    (%%r12)	,%%xmm8 	\n\t"\
		"movaps	0x10(%%rax)	,%%xmm1		\n\t		mulpd	    (%%r13)	,%%xmm10	\n\t"\
		"movaps		%%xmm0	,%%xmm6		\n\t		mulpd	    (%%r13)	,%%xmm9 	\n\t"\
		"movaps		%%xmm1	,%%xmm7		\n\t		mulpd	    (%%r12)	,%%xmm11	\n\t"\
		"movaps	    (%%rbx)	,%%xmm2		\n\t		subpd	%%xmm10		,%%xmm8 	\n\t"\
		"movaps	0x10(%%rbx)	,%%xmm3		\n\t		addpd	%%xmm11		,%%xmm9 	\n\t"\
		"movaps		%%xmm2	,%%xmm4		\n\t		movaps	    (%%r11)	,%%xmm10	\n\t"\
		"movaps		%%xmm3	,%%xmm5		\n\t		movaps	0x10(%%r11)	,%%xmm11	\n\t"\
		"mulpd	    (%%rcx)	,%%xmm2		\n\t		movaps		%%xmm10	,%%xmm12	\n\t"\
		"mulpd	    (%%rcx)	,%%xmm3		\n\t		movaps		%%xmm11	,%%xmm13	\n\t"\
		"mulpd	    (%%rsi)	,%%xmm4		\n\t		mulpd	    (%%r14)	,%%xmm10	\n\t"\
		"mulpd	    (%%rsi)	,%%xmm5		\n\t		mulpd	    (%%r15)	,%%xmm11	\n\t"\
		"subpd	%%xmm5		,%%xmm2		\n\t		mulpd	    (%%r15)	,%%xmm12	\n\t"\
		"addpd	%%xmm4		,%%xmm3		\n\t		mulpd	    (%%r14)	,%%xmm13	\n\t"\
		"addpd	%%xmm2		,%%xmm0		\n\t		subpd	%%xmm11		,%%xmm10	\n\t"\
		"addpd	%%xmm3		,%%xmm1		\n\t		addpd	%%xmm13		,%%xmm12	\n\t"\
		"subpd	%%xmm2		,%%xmm6		\n\t		movaps	%%xmm10		,%%xmm11	\n\t"\
		"subpd	%%xmm3		,%%xmm7		\n\t		movaps	%%xmm12		,%%xmm13	\n\t"\
		"movaps	%%xmm0	,    (%%rax)	\n\t		addpd	%%xmm8 	,%%xmm10		\n\t"\
		"movaps	%%xmm1	,0x10(%%rax)	\n\t		subpd	%%xmm11	,%%xmm8 		\n\t"\
		"movaps	%%xmm6	,    (%%rbx)	\n\t		addpd	%%xmm9 	,%%xmm12		\n\t"\
		"movaps	%%xmm7	,0x10(%%rbx)	\n\t		subpd	%%xmm13	,%%xmm9 		\n\t"\
		"leaq	(%%rax,%%r8,2),%%rax	\n\t		movaps	%%xmm10	,    (%%r10)	\n\t"\
		"leaq	(%%rbx,%%r8,2),%%rbx	\n\t		movaps	%%xmm12	,0x10(%%r10)	\n\t"\
		"movq	%[twid_ptrs],%%r15	\n\t			leaq	(%%r10,%%r8  ),%%r11		\n\t"\
		"movq 0x10(%%r15),%%rcx \n\t movq 0x18(%%r15),%%rsi	\n\t	movaps	%%xmm8,    (%%r11)	\n\t"/* c2,s2 */\
		"movq 0x20(%%r15),%%rdx \n\t movq 0x28(%%r15),%%rdi	\n\t	movaps	%%xmm9,0x10(%%r11)	\n\t"/* c6,s6 */\
		"movaps	    (%%rax)	,%%xmm0		\n\t		leaq	(%%r10,%%r8,2),%%r10	\n\t"\
		"movaps	0x10(%%rax)	,%%xmm2		\n\t		leaq	(%%r11,%%r8,2),%%r11	\n\t"\
		"movaps		%%xmm0	,%%xmm1		\n\t	movq 0x50(%%r15),%%r12 \n\t movq 0x58(%%r15),%%r13	\n\t"/* c3,s3 */\
		"movaps		%%xmm2	,%%xmm3		\n\t	movq 0x60(%%r15),%%r14 \n\t movq 0x68(%%r15),%%r15	\n\t"/* c7,s7 */\
		"mulpd	    (%%rcx)	,%%xmm0		\n\t		movaps	    (%%r10)	,%%xmm8 	\n\t"\
		"mulpd	    (%%rsi)	,%%xmm2		\n\t		movaps	0x10(%%r10)	,%%xmm10	\n\t"\
		"mulpd	    (%%rsi)	,%%xmm1		\n\t		movaps		%%xmm8	,%%xmm9 	\n\t"\
		"mulpd	    (%%rcx)	,%%xmm3		\n\t		movaps		%%xmm10	,%%xmm11	\n\t"\
		"subpd	%%xmm2		,%%xmm0		\n\t		mulpd	    (%%r12)	,%%xmm8 	\n\t"\
		"addpd	%%xmm3		,%%xmm1		\n\t		mulpd	    (%%r13)	,%%xmm10	\n\t"\
		"movaps	    (%%rbx)	,%%xmm2		\n\t		mulpd	    (%%r13)	,%%xmm9 	\n\t"\
		"movaps	0x10(%%rbx)	,%%xmm3		\n\t		mulpd	    (%%r12)	,%%xmm11	\n\t"\
		"movaps		%%xmm2	,%%xmm4		\n\t		subpd	%%xmm10		,%%xmm8 	\n\t"\
		"movaps		%%xmm3	,%%xmm5		\n\t		addpd	%%xmm11		,%%xmm9 	\n\t"\
		"mulpd	    (%%rdx)	,%%xmm2		\n\t		movaps	    (%%r11)	,%%xmm10	\n\t"\
		"mulpd	    (%%rdi)	,%%xmm3		\n\t		movaps	0x10(%%r11)	,%%xmm11	\n\t"\
		"mulpd	    (%%rdi)	,%%xmm4		\n\t		movaps		%%xmm10	,%%xmm12	\n\t"\
		"mulpd	    (%%rdx)	,%%xmm5		\n\t		movaps		%%xmm11	,%%xmm13	\n\t"\
		"subpd	%%xmm3		,%%xmm2		\n\t		mulpd	    (%%r14)	,%%xmm10	\n\t"\
		"addpd	%%xmm5		,%%xmm4		\n\t		mulpd	    (%%r15)	,%%xmm11	\n\t"\
		"movaps	%%xmm2		,%%xmm3		\n\t		mulpd	    (%%r15)	,%%xmm12	\n\t"\
		"movaps	%%xmm4		,%%xmm5		\n\t		mulpd	    (%%r14)	,%%xmm13	\n\t"\
		"addpd	%%xmm0		,%%xmm2		\n\t		subpd	%%xmm11		,%%xmm10	\n\t"\
		"subpd	%%xmm3		,%%xmm0		\n\t		addpd	%%xmm13		,%%xmm12	\n\t"\
		"addpd	%%xmm1		,%%xmm4		\n\t		movaps	%%xmm10		,%%xmm11	\n\t"\
		"subpd	%%xmm5		,%%xmm1		\n\t		movaps	%%xmm12		,%%xmm13	\n\t"\
		"movaps	%%xmm2	,    (%%rax)	\n\t		addpd	%%xmm8 		,%%xmm10	\n\t"\
		"movaps	%%xmm4	,0x10(%%rax)	\n\t		subpd	%%xmm11		,%%xmm8 	\n\t"\
		"movaps	%%xmm0	,    (%%rbx)	\n\t		addpd	%%xmm9 		,%%xmm12	\n\t"\
		"movaps	%%xmm1	,0x10(%%rbx)	\n\t		subpd	%%xmm13		,%%xmm9 	\n\t"\
		"											movaps	%%xmm10	,    (%%r10)	\n\t"\
		"											movaps	%%xmm12	,0x10(%%r10)	\n\t"\
		"											movaps	%%xmm8 	,    (%%r11)	\n\t"\
		"											movaps	%%xmm9 	,0x10(%%r11)	\n\t"\
	/* combine to get 2 length-4 output subtransforms.
	In this step 2 of the 8-dft, we need address-pairs
		lcol:		rcol:
		i0,2,1,3	i4,6,5,7
		o0,2,1,3	o4,6,5,7
	At this point r[a|b]x have i2,3, r1[0|1] have i6,7, but cleaner to reload add0 and go from there.
	Since we will be loading o-addresses into regs starting with r[a|b]x and r1[0|1], use r[c|d]x and r1[2|3]
	for the I-address pairs here: */\
		"movq	%[in0],%%rcx			\n\t		leaq	(%%rcx,%%r8  ),%%r12	\n\t"/* [lcol,rcol] base-addresses = in0 + [0,1*istride] */\
		"leaq	(%%rcx,%%r8,2),%%rdx	\n\t		leaq	(%%r12,%%r8,2),%%r13	\n\t"/* in0 + [2,3*istride] */\
		"movaps	    (%%rcx)	,%%xmm0		\n\t		movaps	    (%%r12)	,%%xmm8 	\n\t"\
		"movaps	0x10(%%rcx)	,%%xmm1		\n\t		movaps	0x10(%%r12)	,%%xmm9 	\n\t"\
		"movaps	%%xmm0		,%%xmm4		\n\t		movaps	%%xmm8 		,%%xmm12	\n\t"\
		"movaps	%%xmm1		,%%xmm5		\n\t		movaps	%%xmm9 		,%%xmm13	\n\t"\
		"movq	%[out0]	,%%rsi			\n\t	movq	%[off]	,%%rdi			\n\t"/* Load output base-address into rsi and offset-array pointer into rdi */\
		"movslq		    (%%rdi),%%rax	\n\t	movslq		0x10(%%rdi),%%r10	\n\t"/*        off[0,4] */\
		"leaq	(%%rsi,%%rax,8),%%rax	\n\t	leaq	(%%rsi,%%r10,8),%%r10	\n\t"/* out0 + off[0,4] */\
		"addpd	    (%%rdx)	,%%xmm0		\n\t		subpd	0x10(%%r13)	,%%xmm8 	\n\t"\
		"subpd	    (%%rdx)	,%%xmm4		\n\t		addpd	0x10(%%r13)	,%%xmm12	\n\t"\
		"addpd	0x10(%%rdx)	,%%xmm1		\n\t		addpd	    (%%r13)	,%%xmm9 	\n\t"\
		"subpd	0x10(%%rdx)	,%%xmm5		\n\t		subpd	    (%%r13)	,%%xmm13	\n\t"\
		"movslq		0x08(%%rdi),%%rbx	\n\t	movslq		0x18(%%rdi),%%r11	\n\t"/*        off[2,6] */\
		"leaq	(%%rsi,%%rbx,8),%%rbx	\n\t	leaq	(%%rsi,%%r11,8),%%r11	\n\t"/* out0 + off[2,6] */\
		"movaps	%%xmm0,    (%%rax)		\n\t		movaps	%%xmm8 	,    (%%r10)	\n\t"\
		"movaps	%%xmm1,0x10(%%rax)		\n\t		movaps	%%xmm9 	,0x10(%%r10)	\n\t"\
		"movaps	%%xmm4,    (%%rbx)		\n\t		movaps	%%xmm12	,    (%%r11)	\n\t"\
		"movaps	%%xmm5,0x10(%%rbx)		\n\t		movaps	%%xmm13	,0x10(%%r11)	\n\t"\
		"leaq	(%%rcx,%%r8,4),%%rcx	\n\t		leaq	(%%r12,%%r8,4),%%r12	\n\t"/* in0 + [4,5*istride] */\
		"leaq	(%%rdx,%%r8,4),%%rdx	\n\t		leaq	(%%r13,%%r8,4),%%r13	\n\t"/* in0 + [6,7*istride] */\
		"movaps	    (%%rcx)	,%%xmm2		\n\t		movaps	    (%%r12)	,%%xmm10	\n\t"\
		"movaps	0x10(%%rcx)	,%%xmm3		\n\t		movaps	0x10(%%r12)	,%%xmm11	\n\t"\
		"movslq		0x04(%%rdi),%%rcx	\n\t	movslq		0x14(%%rdi),%%r12	\n\t"/*        off[1,5] */\
		"leaq	(%%rsi,%%rcx,8),%%rcx	\n\t	leaq	(%%rsi,%%r12,8),%%r12	\n\t"/* out0 + off[1,5] */\
		"movaps	%%xmm2		,%%xmm6		\n\t		movaps	%%xmm10		,%%xmm14	\n\t"\
		"movaps	%%xmm3		,%%xmm7		\n\t		movaps	%%xmm11		,%%xmm15	\n\t"\
		"addpd	    (%%rdx)	,%%xmm2		\n\t		subpd	0x10(%%r13)	,%%xmm10	\n\t"\
		"subpd	    (%%rdx)	,%%xmm6		\n\t		addpd	0x10(%%r13)	,%%xmm14	\n\t"\
		"addpd	0x10(%%rdx)	,%%xmm3		\n\t		addpd	    (%%r13)	,%%xmm11	\n\t"\
		"subpd	0x10(%%rdx)	,%%xmm7		\n\t		subpd	    (%%r13)	,%%xmm15	\n\t"\
		"movslq		0x0c(%%rdi),%%rdx	\n\t	movslq		0x1c(%%rdi),%%r13	\n\t"/*        off[3,7] */\
		"leaq	(%%rsi,%%rdx,8),%%rdx	\n\t	leaq	(%%rsi,%%r13,8),%%r13	\n\t"/* out0 + off[3,7] */\
		"subpd	%%xmm2		,%%xmm0		\n\t		movaps	%%xmm12	,    (%%r13)	\n\t"\
		"subpd	%%xmm3		,%%xmm1		\n\t		movaps	%%xmm13	,0x10(%%r13)	\n\t"\
	/* Use the cosine term of the [c1,s1] pair, which is the *middle* [4th of 7] of our 7 input pairs, in terms \
	of the input-arg bit-reversal reordering defined in the __X[c,s] --> [c,s] mapping below and happens to \
	always in fact *be* a true cosine term, which is a requirement for our "decr 1 gives isrt2" data-copy scheme: */\
		"movq	%[twid_ptrs],%%r15		\n\t	movq	0x30(%%r15),%%r14	\n\t"\
		"subpd	%%xmm7		,%%xmm4		\n\t	subq	$0x10,%%r14			\n\t"/* isrt2 in [c1]-1 */\
		"subpd	%%xmm6		,%%xmm5		\n\t		movaps	%%xmm10		,%%xmm13	\n\t"\
		"addpd	    (%%rax)	,%%xmm2		\n\t		subpd	%%xmm11		,%%xmm10	\n\t"\
		"addpd	0x10(%%rax)	,%%xmm3		\n\t		addpd	%%xmm11		,%%xmm13	\n\t"\
		"addpd	    (%%rbx)	,%%xmm7		\n\t		mulpd	    (%%r14)	,%%xmm10	\n\t"\
		"addpd	0x10(%%rbx)	,%%xmm6		\n\t		mulpd	    (%%r14)	,%%xmm13	\n\t"\
		"movaps	%%xmm2,    (%%rax) /* [o0].r */\n\t	movaps	0x10(%%r13)	,%%xmm11	\n\t"\
		"movaps	%%xmm3,0x10(%%rax) /* [o0].i */\n\t	movaps	%%xmm15		,%%xmm12	\n\t"\
		"movaps	%%xmm4,    (%%rbx) /* [o2].r */\n\t	addpd	%%xmm14		,%%xmm12	\n\t"\
		"movaps	%%xmm6,0x10(%%rbx) /* [o2].i */\n\t	subpd	%%xmm14		,%%xmm15	\n\t"\
		"movaps	%%xmm0,    (%%rcx) /* [o1].r */\n\t	mulpd	    (%%r14)	,%%xmm12	\n\t"\
		"movaps	%%xmm1,0x10(%%rcx) /* [o1].i */\n\t	mulpd	    (%%r14)	,%%xmm15	\n\t"\
		"movaps	%%xmm7,    (%%rdx) /* [o3].r */\n\t	movaps		(%%r13)	,%%xmm14	\n\t"\
		"movaps	%%xmm5,0x10(%%rdx) /* [o3].i */\n\t	subpd	%%xmm10		,%%xmm8 	\n\t"\
		"											subpd	%%xmm13		,%%xmm9 	\n\t"\
		"											subpd	%%xmm12		,%%xmm14	\n\t"\
		"											subpd	%%xmm15		,%%xmm11	\n\t"\
		"											addpd	    (%%r10)	,%%xmm10	\n\t"\
		"											addpd	0x10(%%r10)	,%%xmm13	\n\t"\
		"											addpd	    (%%r11)	,%%xmm12	\n\t"\
		"											addpd	0x10(%%r11)	,%%xmm15	\n\t"\
		"											movaps	%%xmm10,    (%%r10) \n\t"/* [o4].r */\
		"											movaps	%%xmm13,0x10(%%r10) \n\t"/* [o4].i */\
		"											movaps	%%xmm14,    (%%r11) \n\t"/* [o6].r */\
		"											movaps	%%xmm11,0x10(%%r11) \n\t"/* [o6].i */\
		"											movaps	%%xmm8 ,    (%%r12) \n\t"/* [o5].r */\
		"											movaps	%%xmm9 ,0x10(%%r12) \n\t"/* [o5].i */\
		"											movaps	%%xmm12,    (%%r13) \n\t"/* [o7].r */\
		"											movaps	%%xmm15,0x10(%%r13) \n\t"/* [o7].i */\
		:					/* outputs: none */\
		: [in0] "m" (Xin0)	/* All 'm'-inputs from memory addresses here... */\
		 ,[i1] "e" (Xi1)	/* ...except for 'e'-inputs which are literal byte offsets */\
		 ,[out0] "m" (Xout0) /* output-address-octet base pointer */\
		 ,[off] "m" (Xoff)	/* and pointer to uint32 array of 8 double* index offsets */\
		 ,[twid_ptrs] "m" (Xtwid_ptrs)\
		 ,[two] "m" (Xtwo)/* Only used in FMA implementations of this macro */\
		: "cc","memory","rax","rbx","rcx","rdx","rsi","rdi","r8","r10","r11","r12","r13","r14","r15","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15"	/* Clobbered registers */\
	);\
	}

	// SSE2 analog of dft_macro.h::RADIX_08_DIT_TWIDDLE_OOP - Result of sign-flippage and adding separate I/O addressing to
	// radix8_dif_dit_pass_gcc64.h::SSE2_RADIX8_DIF_TWIDDLE. We begin with the DIF macro here because we need a pre-twiddles
	// implementation for our purposes, whereas SSE2_RADIX8_DIT_TWIDDLE is post-twiddles.
	//
	// SIMD Opcount: 102 load/store [30 implicit], 66 add/sub, 50 mul. Compare to DFT macros used for radix-8-pass-with-twiddles:
	// DIF opcount : 140 load/store [56 implicit], 66 add/sub, 32 mul
	// DIT opcount :  85 load/store [36 implicit], 68 add/sub, 32 mul .
	//
	/* Dec 2020: Needed to cut #args in defs of SSE2_RADIX8_DIF_TWIDDLE_OOP and SSE2_RADIX8_DIT_TWIDDLE_OOP
	from 30 to < 24 for Apple M1/Clang builds on Arm64 - do similar on x86 to avoid multiple versions of the
	macro having different arglists.
	The DIT version uses fixed-multiple pointer-offsets for both in-and-output address octets,
	because it is only ever used with small local data arrays as IOs, not the main residue array with its wide
	strides and index-padding scheme. So #def the needed basic strides in the 'calling' routine, then:
	I-addresses are in-order, i.e. what were formerly pointers [i0-7] are now computed as
		in0  + {0,i[1-7]} =  in0 + {0,i[1,2,2+1, 4,4+1,4+2,4+1+2]}
	O-addresses are BRed (bit-reversed), i.e. what were formerly pointers [o0-7] are now computed as
		out0 + {0,o_off[4,2,6,1,5,3,7]} .
	*/
	#define SSE2_RADIX8_DIT_TWIDDLE_OOP(Xin0,Xi1, Xout0,Xo_off, Xtwid_ptrs, Xtwo)\
	{\
	__asm__ volatile (\
	/* i1 is base byte-offset, no need to lshift it prior to add: */\
		"xorq	%%r8,%%r8	\n\t	leaq	%c[i1](%%r8),%%r8	\n\t"/* movq|movslq of literal %c[i1] both segfaulted, workaround via LEA */\
		/* The twid_ptrs[] array holds ptrs to 14 complex twiddles in-order: (c,s)[1,2,3,4,5,6,7]: */\
		"movq	%[twid_ptrs],%%r14	\n\t"\
	/* Block 0/1 has just one twiddle-CMUL: */\
		"movq		%[in0],%%rax		\n\t"\
		"leaq	(%%rax,%%r8),%%rbx		\n\t"\
		"movq	    (%%r14),%%rdi		\n\t"/* [c1,s1]: [rdi,rsi] point to [cos,sin] components of each sincos pair, */\
		"movq	0x08(%%r14),%%rsi		\n\t"/* which is not really a pair here in terms of relative addressing: */\
		"movaps		    (%%rbx),%%xmm4 	\n\t	movaps		0x10(%%rbx),%%xmm5 	\n\t"/* _r4  = __tr1;	_r5  = __ti1; */\
		"movaps		    (%%rax),%%xmm0 	\n\t	movaps		0x10(%%rax),%%xmm1 	\n\t"/* _r0  = __tr0;	_r1  = __ti0; */\
		"movaps		%%xmm5 ,%%xmm6 		\n\t	movaps		%%xmm4 ,%%xmm7 		\n\t"/* _r6  = _r5;		_r7  = _r4;		** [r4,r5] = CMUL(__t1,__W1): */\
		"mulpd		(%%rdi),%%xmm4 		\n\t	mulpd		(%%rdi),%%xmm5 		\n\t"/* _r4 *= __Wr1;	_r5 *= __Wr1; */\
		"mulpd		(%%rsi),%%xmm6 		\n\t	mulpd		(%%rsi),%%xmm7 		\n\t"/* _r6 *= __Wi1;	_r7 *= __Wi1; */\
		"addpd		%%xmm6 ,%%xmm4 		\n\t	subpd		%%xmm7 ,%%xmm5 		\n\t"/* _r4 += _r6;		_r5 -= _r7; */\
		"movaps		%%xmm0 ,%%xmm2 		\n\t	movaps		%%xmm1 ,%%xmm3 		\n\t"/* _r2  = _r0;		_r3  = _r1; */\
		"addpd		%%xmm4 ,%%xmm0 		\n\t	addpd		%%xmm5 ,%%xmm1 		\n\t"/* _r0 += _r4;		_r1 += _r5; */\
		"subpd		%%xmm4 ,%%xmm2 		\n\t	subpd		%%xmm5 ,%%xmm3 		\n\t"/* _r2 -= _r4;		_r3 -= _r5; */\
		"movaps		%%xmm0 ,    (%%rax)	\n\t	movaps		%%xmm1 ,0x10(%%rax)	\n\t"/* __tr0 = _r0;	__ti0 = _r1; */\
		"movaps		%%xmm2 ,    (%%rbx)	\n\t	movaps		%%xmm3 ,0x10(%%rbx)	\n\t"/* __tr1 = _r2;	__ti1 = _r3; */\
	/* Blocks 2/3 use separate register subset, can be done overlapped with 0/1: */\
		"leaq	(%%rax,%%r8,2),%%rcx	\n\t"\
		"movq	0x10(%%r14),%%r10		\n\t"/* [c2,s2] */\
		"movq	0x18(%%r14),%%r11		\n\t"/* [r8,r9] = CMUL(__t2,__W2): */\
		"movaps		    (%%rcx),%%xmm8 	\n\t	movaps		0x10(%%rcx),%%xmm9 	\n\t"/* _r8  = __tr2;	_r9  = __ti2; */\
		"movaps		%%xmm9 ,%%xmm10		\n\t	movaps		%%xmm8 ,%%xmm11		\n\t"/* _ra  = _r9;		_rb  = _r8; */\
		"mulpd		(%%r10),%%xmm8 		\n\t	mulpd		(%%r10),%%xmm9 		\n\t"/* _r8 *= __Wr2;	_r9 *= __Wr2; */\
		"mulpd		(%%r11),%%xmm10		\n\t	mulpd		(%%r11),%%xmm11		\n\t"/* _ra *= __Wi2;	_rb *= __Wi2; */\
		"addpd		%%xmm10,%%xmm8 		\n\t	subpd		%%xmm11,%%xmm9 		\n\t"/* _r8 += _ra;		_r9 -= _rb; */\
		"leaq	(%%rcx,%%r8),%%rdx		\n\t"\
		"movq	0x20(%%r14),%%r12		\n\t"/* [c3,s3] */\
		"movq	0x28(%%r14),%%r13		\n\t"/* [rc,rd] = CMUL(__t3,__W3): */\
		"movaps		    (%%rdx),%%xmm12	\n\t	movaps		0x10(%%rdx),%%xmm13	\n\t"/* _rc  = __tr3;	_rd  = __ti3; */\
		"movaps		%%xmm13,%%xmm14		\n\t	movaps		%%xmm12,%%xmm15		\n\t"/* _re  = _rd;		_rf  = _rc; */\
		"mulpd		(%%r12),%%xmm12		\n\t	mulpd		(%%r12),%%xmm13		\n\t"/* _rc *= __Wr3;	_rd *= __Wr3; */\
		"mulpd		(%%r13),%%xmm14		\n\t	mulpd		(%%r13),%%xmm15		\n\t"/* _re *= __Wi3;	_rf *= __Wi3; */\
		"addpd		%%xmm14,%%xmm12		\n\t	subpd		%%xmm15,%%xmm13		\n\t"/* _rc += _re;		_rd -= _rf; */\
		/* Now do radix-2 butterfly: */\
		"movaps		%%xmm8 ,%%xmm10		\n\t	movaps		%%xmm9 ,%%xmm11		\n\t"/* _ra  = _r8;		_rb  = _r9; */\
		"addpd		%%xmm12,%%xmm8 		\n\t	addpd		%%xmm13,%%xmm9 		\n\t"/* _r8 += _rc;		_r9 += _rd; */\
		"subpd		%%xmm12,%%xmm10		\n\t	subpd		%%xmm13,%%xmm11		\n\t"/* _ra -= _rc;		_rb -= _rd; */\
		"movaps		%%xmm8 ,    (%%rcx)	\n\t	movaps		%%xmm9 ,0x10(%%rcx)	\n\t"/* __tr2 = _r8;	__ti2 = _r9; */\
		"movaps		%%xmm10,    (%%rdx)	\n\t	movaps		%%xmm11,0x10(%%rdx)	\n\t"/* __tr3 = _ra;	__ti3 = _rb; */\
	/* Blocks 4/5: */\
		"shlq	$2,%%r8			\n\t"/* From here on only need offset i4 = 4*i1 */\
		"addq	%%r8,%%rax		\n\t"/* Remaining 4 I-address-calculations are in-place += i4, so use ADD, faster than LEA */\
		"movq	0x30(%%r14),%%rdi		\n\t"/* [c4,s4] */\
		"movq	0x38(%%r14),%%rsi		\n\t"/* [r0,r1] = CMUL(__t4,__W4): */\
		"movaps		    (%%rax),%%xmm0 	\n\t	movaps		0x10(%%rax),%%xmm1 	\n\t"/* _r0  = __tr4;	_r1  = __ti4; */\
		"movaps		%%xmm1 ,%%xmm2 		\n\t	movaps		%%xmm0 ,%%xmm3 		\n\t"/* _r2  = _r1;		_r3  = _r0; */\
		"mulpd		(%%rdi),%%xmm0 		\n\t	mulpd		(%%rdi),%%xmm1 		\n\t"/* _r0 *= __Wr4;	_r1 *= __Wr4; */\
		"mulpd		(%%rsi),%%xmm2 		\n\t	mulpd		(%%rsi),%%xmm3 		\n\t"/* _r2 *= __Wi4;	_r3 *= __Wi4; */\
		"addpd		%%xmm2 ,%%xmm0 		\n\t	subpd		%%xmm3 ,%%xmm1 		\n\t"/* _r0 += _r2;		_r1 -= _r3; */\
		"addq	%%r8,%%rbx		\n\t"\
		"movq	0x40(%%r14),%%r10		\n\t"/* [c5,s5] */\
		"movq	0x48(%%r14),%%r11		\n\t"/* [r4,r5] = CMUL(__t5,__W5): */\
		"movaps		    (%%rbx),%%xmm4 	\n\t	movaps		0x10(%%rbx),%%xmm5 	\n\t"/* _r4  = __tr5;	_r5  = __ti5; */\
		"movaps		%%xmm5 ,%%xmm6 		\n\t	movaps		%%xmm4 ,%%xmm7 		\n\t"/* _r6  = _r5;		_r7  = _r4; */\
		"mulpd		(%%r10),%%xmm4 		\n\t	mulpd		(%%r10),%%xmm5 		\n\t"/* _r4 *= __Wr5;	_r5 *= __Wr5; */\
		"mulpd		(%%r11),%%xmm6 		\n\t	mulpd		(%%r11),%%xmm7 		\n\t"/* _r6 *= __Wi5;	_r7 *= __Wi5; */\
		"addpd		%%xmm6 ,%%xmm4 		\n\t	subpd		%%xmm7 ,%%xmm5 		\n\t"/* _r4 += _r6;		_r5 -= _r7; */\
		/* Now do radix-2 butterfly: */\
		"movaps		%%xmm0 ,%%xmm2 		\n\t	movaps		%%xmm1 ,%%xmm3 		\n\t"/* _r2  = _r0;		_r3  = _r1; */\
		"addpd		%%xmm4 ,%%xmm0 		\n\t	addpd		%%xmm5 ,%%xmm1 		\n\t"/* _r0 += _r4;		_r1 += _r5; */\
		"subpd		%%xmm4 ,%%xmm2 		\n\t	subpd		%%xmm5 ,%%xmm3 		\n\t"/* _r2 -= _r4;		_r3 -= _r5; */\
	/* Blocks 6/7 use separate register subset, can be done overlapped with 4/5: */\
		"addq	%%r8,%%rcx		\n\t"\
		"movq	0x50(%%r14),%%r10		\n\t"/* [c6,s6] */\
		"movq	0x58(%%r14),%%r11		\n\t"/* [r8,r9] = CMUL(__t6,__W6): */\
		"movaps		    (%%rcx),%%xmm8 	\n\t	movaps		0x10(%%rcx),%%xmm9 	\n\t"/* _r8  = __tr6;	_r9  = __ti6; */\
		"movaps		%%xmm9 ,%%xmm10		\n\t	movaps		%%xmm8 ,%%xmm11		\n\t"/* _ra  = _r9;		_rb  = _r8; */\
		"mulpd		(%%r10),%%xmm8 		\n\t	mulpd		(%%r10),%%xmm9 		\n\t"/* _r8 *= __Wr6;	_r9 *= __Wr6; */\
		"mulpd		(%%r11),%%xmm10		\n\t	mulpd		(%%r11),%%xmm11		\n\t"/* _ra *= __Wi6;	_rb *= __Wi6; */\
		"addpd		%%xmm10,%%xmm8 		\n\t	subpd		%%xmm11,%%xmm9 		\n\t"/* _r8 += _ra;		_r9 -= _rb; */\
		"addq	%%r8,%%rdx		\n\t"\
		"movq	0x60(%%r14),%%r12		\n\t"/* [c7,s7] */\
		"movq	0x68(%%r14),%%r13		\n\t"/* [rc,rd] = CMUL(__t7,__W7): */\
		"movaps		    (%%rdx),%%xmm12	\n\t	movaps		0x10(%%rdx),%%xmm13	\n\t"/* _rc  = __tr7;	_rd  = __ti7; */\
		"movaps		%%xmm13,%%xmm14		\n\t	movaps		%%xmm12,%%xmm15		\n\t"/* _re  = _rd;		_rf  = _rc; */\
		"mulpd		(%%r12),%%xmm12		\n\t	mulpd		(%%r12),%%xmm13		\n\t"/* _rc *= __Wr7;	_rd *= __Wr7; */\
		"mulpd		(%%r13),%%xmm14		\n\t	mulpd		(%%r13),%%xmm15		\n\t"/* _re *= __Wi7;	_rf *= __Wi7; */\
		"addpd		%%xmm14,%%xmm12		\n\t	subpd		%%xmm15,%%xmm13		\n\t"/* _rc += _re;		_rd -= _rf; */\
		/* Now do radix-2 butterfly: */\
		"movaps		%%xmm8 ,%%xmm10		\n\t	movaps		%%xmm9 ,%%xmm11		\n\t"/* _ra  = _r8;		_rb  = _r9; */\
		"addpd		%%xmm12,%%xmm8 		\n\t	addpd		%%xmm13,%%xmm9 		\n\t"/* _r8 += _rc;		_r9 += _rd; */\
		"subpd		%%xmm12,%%xmm10		\n\t	subpd		%%xmm13,%%xmm11		\n\t"/* _ra -= _rc;		_rb -= _rd; */\
	/* Reload Block 0-3 outputs into r4-7,c-f, combine to get the 2 length-4 subtransform... */\
		"subq		%%r8,%%rax			\n\t"\
		"subq		%%r8,%%rbx			\n\t"\
		"subq		%%r8,%%rcx			\n\t"\
		"subq		%%r8,%%rdx			\n\t"\
		"movaps		    (%%rax),%%xmm4 	\n\t	movaps		0x10(%%rax),%%xmm5 	\n\t"/* _r4 = __tr0;	_r5 = __ti0; */\
		"movaps		    (%%rbx),%%xmm6 	\n\t	movaps		0x10(%%rbx),%%xmm7 	\n\t"/* _r6 = __tr1;	_r7 = __ti1; */\
		"movaps		    (%%rcx),%%xmm12	\n\t	movaps		0x10(%%rcx),%%xmm13	\n\t"/* _rc = __tr2;	_rd = __ti2; */\
		"movaps		    (%%rdx),%%xmm14	\n\t	movaps		0x10(%%rdx),%%xmm15	\n\t"/* _re = __tr3;	_rf = __ti3; */\
	"movq		%[out0],%%rax			\n\t	movq		%[o_off],%%r8		\n\t"/* out0, off1 */\
	"movq		%[two],%%rsi			\n\t	leaq		(%%r8,%%r8),%%r9	\n\t"/* (vec_dbl)2.0, off2 */\
		"										leaq		(%%r9,%%r9),%%r10	\n\t"/* off4 */\
		"subpd		%%xmm12,%%xmm4 		\n\t	subpd		%%xmm13,%%xmm5 		\n\t"/* _r4 -= _rc;		_r5 -= _rd; */\
		"subpd		%%xmm15,%%xmm6 		\n\t	subpd		%%xmm14,%%xmm7 		\n\t"/* _r6 -= _rf;		_r7 -= _re; */\
		"subpd		%%xmm8 ,%%xmm0 		\n\t	subpd		%%xmm9 ,%%xmm1 		\n\t"/* _r0 -= _r8;		_r1 -= _r9; */\
		"subpd		%%xmm11,%%xmm2 		\n\t	subpd		%%xmm10,%%xmm3 		\n\t"/* _r2 -= _rb;		_r3 -= _ra; */\
		/* We hope the microcode execution engine sticks the datum at (%%rsi) into a virtual register and inlines the MULs with the above SUBs: */\
		"mulpd		(%%rsi),%%xmm12		\n\t	mulpd		(%%rsi),%%xmm13		\n\t"/* _rc *= _two;	_rd *= _two; */\
		"mulpd		(%%rsi),%%xmm15		\n\t	mulpd		(%%rsi),%%xmm14		\n\t"/* _rf *= _two;	_re *= _two; */\
		"mulpd		(%%rsi),%%xmm8 		\n\t	mulpd		(%%rsi),%%xmm9 		\n\t"/* _r8 *= _two;	_r9 *= _two; */\
		"mulpd		(%%rsi),%%xmm11		\n\t	mulpd		(%%rsi),%%xmm10		\n\t"/* _rb *= _two;	_ra *= _two; */\
		"addpd		%%xmm4 ,%%xmm12		\n\t	addpd		%%xmm5 ,%%xmm13		\n\t"/* _rc += _r4;		_rd += _r5; */\
		"addpd		%%xmm6 ,%%xmm15		\n\t	addpd		%%xmm7 ,%%xmm14		\n\t"/* _rf += _r6;		_re += _r7; */\
		"addpd		%%xmm0 ,%%xmm8 		\n\t	addpd		%%xmm1 ,%%xmm9 		\n\t"/* _r8 += _r0;		_r9 += _r1; */\
		"addpd		%%xmm2 ,%%xmm11		\n\t	addpd		%%xmm3 ,%%xmm10		\n\t"/* _rb += _r2;		_ra += _r3; */\
		/* In terms of our original scalar-code prototyping macro, the data are: __tr0 = _r[c,f,4,6,8,b,0,2], __ti0 = _r[d,7,5,e,9,3,1,a]; */\
	/* Now combine the two half-transforms: */\
		/* Need r2/3 +- a/b combos for the *ISRT2 preceding the output 4-7 radix-2 butterflies, so start them first: */\
		"subpd		%%xmm3 ,%%xmm11		\n\t	subpd		%%xmm10,%%xmm2 		\n\t"/* _rb -= _r3;		_r2 -= _ra; */\
		"subpd		%%xmm8 ,%%xmm12		\n\t	subpd		%%xmm9 ,%%xmm13		\n\t"/* _rc -= _r8;		_rd -= _r9; */\
		"subpd		%%xmm1 ,%%xmm4 		\n\t	subpd		%%xmm0 ,%%xmm5 		\n\t"/* _r4 -= _r1;		_r5 -= _r0; */\
		"mulpd		(%%rsi),%%xmm3 		\n\t	mulpd		(%%rsi),%%xmm10		\n\t"/* _r3 *= _two;	_ra *= _two; */\
		"mulpd		(%%rsi),%%xmm8 		\n\t	mulpd		(%%rsi),%%xmm9 		\n\t"/* _r8 *= _two;	_r9 *= _two; */\
		"mulpd		(%%rsi),%%xmm1 		\n\t	mulpd		(%%rsi),%%xmm0 		\n\t"/* _r1 *= _two;	_r0 *= _two; */\
		"addpd		%%xmm11,%%xmm3 		\n\t	addpd		%%xmm2 ,%%xmm10		\n\t"/* _r3 += _rb;		_ra += _r2; */\
		"addpd		%%xmm12,%%xmm8 		\n\t	addpd		%%xmm13,%%xmm9 		\n\t"/* _r8 += _rc;		_r9 += _rd; */\
		"addpd		%%xmm4 ,%%xmm1 		\n\t	addpd		%%xmm5 ,%%xmm0 		\n\t"/* _r1 += _r4;		_r0 += _r5; */\
		/*movq		%[o0],%%rax		[o0] already in rax */	\
		"leaq	(%%rax,%%r9 ),%%rcx		\n\t"/* out0 + off2, compute first to allow time for LEA to finish before += off4 to get out0 + off6 */\
		"leaq	(%%rax,%%r10),%%rbx		\n\t"/* out0 + off4 */\
		"leaq	(%%rcx,%%r10),%%rdx		\n\t"/* out0 + off6 */\
		"movaps		%%xmm12,    (%%rbx)	\n\t	movaps		%%xmm13,0x10(%%rbx)	\n\t"/* __Br1 = _rc;	__Bi1 = _rd; */\
		/* Use that _rc,d free to stick 2.0 into _rc and that [c4] in rdi to load ISRT2 from c4-1 into _rd: */\
		"movaps		    (%%rsi),%%xmm12	\n\t	movaps		-0x10(%%rdi),%%xmm13\n\t"/* _rc = 2.0;		_rd = ISRT2; */\
		"movaps		%%xmm4 ,    (%%rdx)	\n\t	movaps		%%xmm0 ,0x10(%%rdx)	\n\t"/* __Br3 = _r4;	__Bi3 = _r0; */\
		"movaps		%%xmm8 ,    (%%rax)	\n\t	movaps		%%xmm9 ,0x10(%%rax)	\n\t"/* __Br0 = _r8;	__Bi0 = _r9; */\
		"movaps		%%xmm1 ,    (%%rcx)	\n\t	movaps		%%xmm5 ,0x10(%%rcx)	\n\t"/* __Br2 = _r1;	__Bi2 = _r5; */\
		"mulpd		%%xmm13,%%xmm3 		\n\t	mulpd		%%xmm13,%%xmm11		\n\t"/* _r3 *= ISRT2;	_rb *= ISRT2; */\
		"mulpd		%%xmm13,%%xmm2 		\n\t	mulpd		%%xmm13,%%xmm10		\n\t"/* _r2 *= ISRT2;	_ra *= ISRT2; */\
		"subpd		%%xmm3 ,%%xmm15		\n\t	subpd		%%xmm11,%%xmm7 		\n\t"/* _rf -= _r3;		_r7 -= _rb; */\
		"subpd		%%xmm2 ,%%xmm6 		\n\t	subpd		%%xmm10,%%xmm14		\n\t"/* _r6 -= _r2;		_re -= _ra; */\
		"mulpd		%%xmm12,%%xmm3 		\n\t	mulpd		%%xmm12,%%xmm11		\n\t"/* _r3 *= _two;	_rb *= _two; */\
		"mulpd		%%xmm12,%%xmm2 		\n\t	mulpd		%%xmm12,%%xmm10		\n\t"/* _r2 *= _two;	_ra *= _two; */\
		"addpd		%%xmm15,%%xmm3 		\n\t	addpd		%%xmm7 ,%%xmm11		\n\t"/* _r3 += _rf;		_rb += _r7; */\
		"addpd		%%xmm6 ,%%xmm2 		\n\t	addpd		%%xmm14,%%xmm10		\n\t"/* _r2 += _r6;		_ra += _re; */\
		"addq		%%r8 ,%%rax			\n\t"/* out0 + off[1,5,3,7] */\
		"addq		%%r8 ,%%rbx			\n\t"\
		"addq		%%r8 ,%%rcx			\n\t"\
		"addq		%%r8 ,%%rdx			\n\t"\
		"movaps		%%xmm3 ,    (%%rax)	\n\t	movaps		%%xmm7 ,0x10(%%rax)	\n\t"/* __Br4 = _r3;	__Bi4 = _r7; */\
		"movaps		%%xmm15,    (%%rbx)	\n\t	movaps		%%xmm11,0x10(%%rbx)	\n\t"/* __Br5 = _rf;	__Bi5 = _rb; */\
		"movaps		%%xmm6 ,    (%%rcx)	\n\t	movaps		%%xmm14,0x10(%%rcx)	\n\t"/* __Br6 = _r6;	__Bi6 = _re; */\
		"movaps		%%xmm2 ,    (%%rdx)	\n\t	movaps		%%xmm10,0x10(%%rdx)	\n\t"/* __Br7 = _r2;	__Bi7 = _ra; */\
		:					/* outputs: none */\
		: [in0] "m" (Xin0)	/* All 'm'-inputs from memory addresses here... */\
		 ,[i1] "e" (Xi1)	/* ...except for 'e'-inputs which are literal byte offsets */\
		 ,[out0] "m" (Xout0)\
		 ,[o_off] "m" (Xo_off)/* O-address pointer-stride */\
		 ,[twid_ptrs] "m" (Xtwid_ptrs)\
		 ,[two] "m" (Xtwo)/* Only used in FMA implementations of this macro */\
		: "cc","memory","rax","rbx","rcx","rdx","rsi","rdi","r8","r9","r10","r11","r12","r13","r14","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15"	/* Clobbered registers */\
	);\
	}

#endif	// AVX / SSE2 toggle

#endif	/* sse2_macro_gcc_h_included */

